File: gene-reader.cc

package info (click to toggle)
transtermhp 2.09-5
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye, forky, sid, trixie
  • size: 536 kB
  • sloc: cpp: 4,665; python: 294; makefile: 203; sh: 115
file content (173 lines) | stat: -rw-r--r-- 4,322 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
/* This file is part of TransTerm v2.0 BETA and is covered by the GNU GPL
 * License version 2.0. See file LICENSE.txt for more details. */

#include <iostream>
#include <string>
#include <sstream>
#include <cassert>
#include "gene-reader.h"
#include "util.h"

// return the right kind of annotation reader given teh extn of the filename
// if none, return 0
GeneReader *
gene_reader_factory(const string & fn)
{
    string extn = fn.substr(fn.rfind('.')+1);
    if(extn == "ptt") return new PTTReader(fn);
    if(extn == "coords" || extn == "crd") return new CoordsReader(fn);
    return 0;
}


// make a new PTTReader.
PTTReader::PTTReader(const string & fn)
    : _in(fn.c_str())
{
    unsigned dotpos = fn.rfind('.');
    // remove the extn
    _id = (dotpos == 0)?fn:fn.substr(0, dotpos);

    // remove leading paths .../
    _id = _id.substr(_id.rfind('/')+1);
}


// read the genes and put them into the genome
bool
PTTReader::read_genes(Genome & g)
{
    string line;
    assert(good());

    // read the header
    getline(_in, line);
    getline(_in, line);
    getline(_in, line);

    while(getline(_in, line))
    {
        string loc, strand, pid, gene, syn, code, cog, desc, name;
        int len;

        istringstream iss(line);
        iss >> loc >> strand >> len >> pid >> gene >> syn >> code >> cog;
        getline(iss, desc);
        desc = trim_front(desc);

        if(gene != "-") name = gene;
        else if(syn != "-") name = syn;
        else if(pid != "-") name = pid;
        else name = "UNK";

        vector<string> locvec;
        split(loc, '.', locvec);
        if(locvec.size() != 3)
        {
            cerr << "Bad location format: " << loc << endl;
            exit(3);
        }

        unsigned long start, end;
        start = atol(locvec[0].c_str());
        end = atol(locvec[2].c_str());

        // make sure that the coords are given as left..right where left <= right
        if(end < start)
        {
            cerr << "WARNING: TransTerm does not handle genomes with genes that wrap around." 
                 << endl;
            continue;
        }

        if(strand == "-") 
        {
            swap(start, end);
        }
        else if(strand != "+")
        {
            cerr << "Unknown strand value: " << strand << endl;
            exit(3);
        }

        Seq * s = chrom_for_id(g, _id);
        
        if(s)
        {
            if(start > s->length || end > s->length || start < 0 || end < 0)
            {
                cerr << "Bad gene coordinates: " << start << " - " << end << endl;
                exit(3);
            }

            s->genes.push_back(
                new Region(name, s, s->dna + start - 1, s->dna + end - 1, desc));
        }
        else
        {
            cerr << "Can't find seq for id: " << _id << endl;
            exit(3);
        }
    }
    return true;
}


// make a new reader for .coords file
CoordsReader::CoordsReader(const string & fn)
    : _in(fn.c_str())
{
}


// Read the gene cords file a stream formated as:
//      gene_name   start   end chrom_id
// where if start > end the gene runs on the other strand.
// Start and end are 1-based
bool
CoordsReader::read_genes(Genome & g)
{
    assert(good());
    Seq * s;
    string line;

    while(getline(_in, line))
    {
        string name, chid;
        unsigned long startidx, endidx;

        istringstream iss(line);
        chid = "";
        iss >> name >> startidx >> endidx >> chid;

        // if there was no chid, we assume that its b/c we are missing
        // the gene names.
        if(chid == "")
        {
            istringstream iss(line);
            iss >> startidx >> endidx >> chid;
            name = "UNK";
        }

        s = chrom_for_id(g, chid);
        if(s)
        {
            if(startidx > s->length || endidx > s->length ||
               startidx <= 0 || endidx <= 0)
            {
                cerr << "Bad gene coordinates: " << startidx << " .. " 
                     << endidx << endl;
                exit(3);
            }

            s->genes.push_back(
                new Region(name, s, s->dna + startidx - 1, s->dna + endidx - 1));
        }
        else
        {
            cerr << "Unknown chromosome id: " << chid << endl;
            exit(3);
        }
    }
    return true;
}