File: filter_counts.c

package info (click to toggle)
chip-seq 1.5.5-3
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye, forky, sid, trixie
  • size: 21,320 kB
  • sloc: ansic: 10,053; perl: 1,493; makefile: 103; sh: 91
file content (295 lines) | stat: -rwxr-xr-x 7,639 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
/*
  filter_counts.c

  Filter Feature read counts that occur within user-defined regions
  of interest such as Repeat Mask regions. 
  If the -r option is set, the prpgram retains read counts that occur
  within user-defined regions of interest.
  
  # Arguments:
  # Feature name of the user-defined regions of interest (e.g. RMSK) 


  Giovanna Ambrosini, EPFL/ISREC, Giovanna.Ambrosini@epfl.ch

  Copyright (c) 2014 EPFL and Swiss Institute of Bioinformatics.

  This program is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program.  If not, see <http://www.gnu.org/licenses/>.

*/
/*
#define DEBUG 
*/
#define _GNU_SOURCE
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <ctype.h>
#ifdef DEBUG
#include <mcheck.h>
#endif

#include "version.h"

/*#define BUF_SIZE 4096 */
#define BUF_SIZE 8192
#define LINE_SIZE 1024
#define FT_MAX  64
#define SEQ_ID  32
#define POS_MAX 16
#define CNT_MAX 16
#define EXT_MAX 256

typedef struct _options_t {
  int help;
  int debug;
  int retain;
} options_t;

static options_t options;

char *Feature = NULL;

int
process_sga(FILE *input, char *iFile) 
{
  unsigned long pos; 
  int cnt;
  char *s, *res, *buf;
  size_t bLen = LINE_SIZE;
  int prt = 0;
  char annotation[EXT_MAX];

  if ((s = malloc(bLen * sizeof(char))) == NULL) {
    perror("process_sga: malloc");
    exit(1);
  }
  if (options.debug)
    fprintf(stderr, "Processing file %s\n", iFile);
#ifdef DEBUG
  int c = 1; 
#endif
  while ((res = fgets(s, (int) bLen, input)) != NULL) {
    size_t cLen = strlen(s);
    char seq_id[SEQ_ID] = "";
    char ft[FT_MAX] = ""; 
    char position[POS_MAX] = ""; 
    char count[CNT_MAX] = ""; 
    char strand = '\0';
    char ext[EXT_MAX]; 
    unsigned int i = 0;

    memset(ext, 0, (size_t)EXT_MAX);
    while (cLen + 1 == bLen && s[cLen - 1] != '\n') {
      bLen *= 2;
      if ((s = realloc(s, bLen)) == NULL) {
        perror("process_file: realloc");
        exit(1);
      }
      res = fgets(s + cLen, (int) (bLen - cLen), input);
      cLen = strlen(s);
    }
    if (s[cLen - 1] == '\n')
      s[cLen - 1] = 0;

    buf = s;
    /* Get SGA fields */
    /* SEQ ID */
    while (*buf != 0 && !isspace(*buf)) {
      if (i >= SEQ_ID) {
        fprintf(stderr, "Seq ID is too long \"%s\" \n", buf);
        exit(1);
      }
      seq_id[i++] = *buf++;
    }
    while (isspace(*buf))
      buf++;
    /* FEATURE */
    i = 0;
    while (*buf != 0 && !isspace(*buf)) {
      if (i >= FT_MAX) {
        fprintf(stderr, "Feature is too long \"%s\" \n", buf);
        exit(1);
      }
      ft[i++] = *buf++;
    }
    while (isspace(*buf))
      buf++;
    /* Position */
    i = 0;
    while (isdigit(*buf)) { 
      if (i >= POS_MAX) {
        fprintf(stderr, "Position is too large \"%s\" \n", buf);
        exit(1);
      }
      position[i++] = *buf++;
    }
    position[i] = 0;
    pos = (unsigned long)atol(position);
    while (isspace(*buf))
      buf++;
    /* Strand */
    strand = *buf++;
    while (isspace(*buf))
      buf++;
    /* Counts */
    i = 0;
    while (isdigit(*buf)) { 
      if (i >= CNT_MAX) {
        fprintf(stderr, "Count is too large \"%s\" \n", buf);
        exit(1);
      }
      count[i++] = *buf++;
    }
    count[i] = 0;
    cnt = atoi(count);
    while (isspace(*buf))
      buf++;
    /* SGA Extension */
    i = 0;
    while (*buf != 0) {
      if (i >= EXT_MAX) {
        fprintf(stderr, "Extension is too long \"%s\" \n", buf);
        exit(1);
      }
      ext[i++] = *buf++;
    }
#ifdef DEBUG
  fprintf(stderr, " [%d] seq ID: %s   Feat: %s (%c)  Pos: %lu  Cnts: %d Ext: %s\n", c++, seq_id, ft, strand, pos, cnt, ext);
#endif
    if (!strcmp(ft, Feature)) {
      if (strand == '+') {
        prt++;
        strcpy(annotation, ext);
      } else if (strand == '-') {
        prt--;
      }
    } else {
      if (options.retain) {
        if (prt >= 1) {
          if (!strcmp(ext, "")) {
            printf("%s\t%s\t%lu\t%c\t%d\t%s\n", seq_id, ft , pos, strand, cnt, annotation);
          } else {
            printf("%s\t%s\t%lu\t%c\t%d\t%s\t%s\n", seq_id, ft, pos, strand, cnt, ext, annotation);
          }
        }
      } else {
        if (prt < 1) {
          if (!strcmp(ext, "")) {
            printf("%s\t%s\t%lu\t%c\t%d\n", seq_id, ft , pos, strand, cnt);
          } else {
            printf("%s\t%s\t%lu\t%c\t%d\t%s\n", seq_id, ft, pos, strand, cnt, ext);
          }
        }
      }
    }
  } /* End of While */
  free(s);
  return 0;
}

int
main(int argc, char *argv[])
{
#ifdef DEBUG
  mcheck(NULL);
  mtrace();
#endif
  FILE *input;
  int free_flag = 0;

  while (1) {
    int c = getopt(argc, argv, "f:dhr");
    if (c == -1)
      break;
    switch (c) {
      case 'f':
        Feature = optarg;
        break;
      case 'd':
        options.debug = 1;
        break;
      case 'h':
        options.help = 1;
        break;
      case 'r':
        options.retain = 1;
        break;
      default:
        printf ("?? getopt returned character code 0%o ??\n", c);
    }
  }
  if (optind > argc || options.help == 1) {
    fprintf(stderr, "Usage: %s [options] [<] <SGA File>\n"
             "      - version %s\n"
             "      where options are:\n"
	     "  \t\t -h     Show this help text\n"
	     "  \t\t -d     Print debug information\n"
	     "  \t\t -f     Feature defining the user-defined regions of interest (default=RMSK)\n"
	     "  \t\t -r     Retain Mode on\n"
	     "\n\tThe program reads a ChIP-seq data file (or from stdin [<]) in SGA format (<SGA File>)\n"
	     "\tand filters out all read counts that occur within user-defined regions of interest, such\n"
             "\tas Repeat Mask regions (feature=RMSK).\n"
             "\tIf 'Retain Mode' is on, it only retains those lines that are within the user-defined regions,\n"
             "\tin which case the annotation of the user-defined regions is added to the output SGA file.\n\n",
	     argv[0], VERSION);
      return 1;
  }
  if (argc > optind) {
      if(!strcmp(argv[optind],"-")) {
          input = stdin;
      } else {
          input = fopen(argv[optind], "r");
          if (NULL == input) {
              fprintf(stderr, "Unable to open '%s': %s(%d)\n",
                  argv[optind], strerror(errno), errno);
             exit(EXIT_FAILURE);
          }
          if (options.debug)
             fprintf(stderr, "Processing file %s\n", argv[optind]);
      }
  } else {
      input = stdin;
  }
  if (options.debug) {
    fprintf(stderr, " Feature name: %s\n\n", Feature);
  }
  /* Process Feature */
  if (Feature == NULL) {
    if ((Feature = malloc(5 * sizeof(char))) == NULL) {
      perror("process_sga: malloc Feature");
      exit(1);
    }
    strcpy(Feature, "RMSK");
    free_flag = 1;
  } else {
    char *s = Feature;
    int i = 0;
    while (*s != 0  && !isspace(*s)) {
      if (i >= FT_MAX) {
        fprintf(stderr, "Feature Name too long \"%s\" \n", Feature);
        return 1;
      }
      s++;
    }
  }
  if (process_sga(input, argv[optind++]) != 0) {
    return 1;
  }
  if (free_flag)
    free(Feature);
  return 0;
}