1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294
|
/*
* Pair a fastq file using a quick index.
*
* We have two files, and each file holds sequence data. Each data element is represented by only four, and exactly four
* lines:
*
* @id1/1 <- the sequence identifier always begins with an @ sign and then the identifier is the string upto the first whitespace. The identifier must be unique per sequence in a file
* ATGATC.... <- the DNA sequence
* + <- a line that begins with the + sign and may also have the identifier (but doesn't need to!)
* !%$#^@ <- an ascii representation of the "quality" of the sequence. The quality is a number between 0 and 100 and this is the chr of that number +33
*
* In the two files, there should be some sequences that are related, and they are denoted either as 1/2, forward/reverse, or just by having the whole id the same
*
* @id1/1 pairs with @id1/2
*
* or
*
* @id1/f pairs with @id1/r
*
* or
*
* @id1 in file 1 pairs with @id1 in file 2
*
* We read the file and make a hash of the ID without the /1 or /f and the position of that id in the file (using tell)
* Then we read the second file and check to see if we have a matching sequence. If we do, we print both sequences
* one to each file, and we set the "printed" boolean to true in our data structure corresponding to that ID.
*
* Finally, we read through our data structure and print out any sequences that we have not seen before.
*
* Note that to print out the sequences we seek to the position we recorded and print four lines.
*
*/
#include "fastq_pair.h"
#include "robstr.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
int pair_files(char *left_fn, char *right_fn, struct options *opt) {
// our hash for the fastq ids.
struct idloc **ids;
ids = malloc(sizeof(*ids) * opt->tablesize);
// if we are not able to allocate the memory for this, there is no point continuing!
if (ids == NULL) {
fprintf(stderr, "We cannot allocate the memory for a table size of %d. Please try a smaller value for -t\n", opt->tablesize);
exit(-1);
}
FILE *lfp;
char *line = malloc(sizeof(char) * MAXLINELEN + 1);
if ((lfp = fopen(left_fn, "r")) == NULL) {
fprintf(stderr, "Can't open file %s\n", left_fn);
exit(1);
}
char *aline; /* this variable is not used, it suppresses a compiler warning */
long int nextposition = 0;
/*
* Read the first file and make an index of that file.
*/
while ((aline = fgets(line, MAXLINELEN, lfp)) != NULL) {
struct idloc *newid;
newid = (struct idloc *) malloc(sizeof(*newid));
if (newid == NULL) {
fprintf(stderr, "Can't allocate memory for new ID pointer\n");
return 0;
}
line[strcspn(line, "\n")] = '\0';
line[strcspn(line, " ")] = '\0';
/*
* Figure out what the match mechanism is. We have three examples so
* i. using /1 and /2
* ii. using /f and /r
* iii. just having the whole name
*
* If there is a /1 or /2 in the file name, we set that part to null so the string is only up
* to before the / and use that to store the location.
*/
char lastchar = line[strlen(line)-1];
char lastbutone = line[strlen(line)-2];
if ('/' == lastbutone || '_' == lastbutone || '.' == lastbutone)
if ('1' == lastchar || '2' == lastchar || 'f' == lastchar || 'r' == lastchar)
line[strlen(line)-1] = '\0';
if (opt->verbose)
fprintf(stderr, "ID is |%s|\n", line);
newid->id = dupstr(line);
newid->pos = nextposition;
newid->printed = false;
newid->next = NULL;
unsigned hashval = hash(newid->id) % opt->tablesize;
newid->next = ids[hashval];
ids[hashval] = newid;
/* read the next three lines and ignore them: sequence, header, and quality */
for (int i=0; i<3; i++)
aline = fgets(line, MAXLINELEN, lfp);
nextposition = ftell(lfp);
}
/*
* Now just print all the id lines and their positions
*/
if (opt->print_table_counts) {
fprintf(stdout, "Bucket sizes\n");
for (int i = 0; i < opt->tablesize; i++) {
struct idloc *ptr = ids[i];
int counter=0;
while (ptr != NULL) {
// fprintf(stdout, "ID: %s Position %ld\n", ptr->id, ptr->pos);
counter++;
ptr = ptr->next;
}
fprintf(stdout, "%d\t%d\n", i, counter);
}
}
/* now we want to open output files for left_paired, right_paired, and right_single */
FILE * left_paired;
FILE * left_single;
FILE * right_paired;
FILE * right_single;
char *lpfn = catstr(dupstr(left_fn), ".paired.fq");
char *rpfn = catstr(dupstr(right_fn), ".paired.fq");
char *lsfn = catstr(dupstr(left_fn), ".single.fq");
char *rsfn = catstr(dupstr(right_fn), ".single.fq");
printf("Writing the paired reads to %s and %s.\nWriting the single reads to %s and %s\n", lpfn, rpfn, lsfn, rsfn);
if ((left_paired = fopen(lpfn, "w")) == NULL ) {
fprintf(stderr, "Can't open file %s\n", lpfn);
exit(1);
}
if ((left_single = fopen(lsfn, "w")) == NULL) {
fprintf(stderr, "Can't open file %s\n", lsfn);
exit(1);
}
if ((right_paired = fopen(rpfn, "w")) == NULL) {
fprintf(stderr, "Can't open file %s\n", rpfn);
exit(1);
}
if ((right_single = fopen(rsfn, "w")) == NULL) {
fprintf(stderr, "Can't open file %s\n", rsfn);
exit(1);
}
/*
* Now read the second file, and print out things in common
*/
int left_paired_counter=0;
int right_paired_counter=0;
int left_single_counter=0;
int right_single_counter=0;
FILE *rfp;
if ((rfp = fopen(right_fn, "r")) == NULL) {
fprintf(stderr, "Can't open file %s\n", left_fn);
exit(1);
}
nextposition = 0;
while ((aline = fgets(line, MAXLINELEN, rfp)) != NULL) {
// make a copy of the current line so we can print it out later.
char *headerline = dupstr(line);
line[strcspn(line, "\n")] = '\0';
line[strcspn(line, " ")] = '\0';
/* remove the last character, as we did above */
char lastchar = line[strlen(line)-1];
char lastbutone = line[strlen(line)-2];
if ('/' == lastbutone || '_' == lastbutone || '.' == lastbutone)
if ('1' == lastchar || '2' == lastchar || 'f' == lastchar || 'r' == lastchar)
line[strlen(line)-1] = '\0';
// now see if we have the mate pair
unsigned hashval = hash(line) % opt->tablesize;
struct idloc *ptr = ids[hashval];
long int posn = -1; // -1 is not a valid file position
while (ptr != NULL) {
if (strcmp(ptr->id, line) == 0) {
posn = ptr->pos;
ptr->printed = true;
}
ptr = ptr->next;
}
if (posn != -1) {
// we have a match.
// lets process the left file
fseek(lfp, posn, SEEK_SET);
left_paired_counter++;
for (int i=0; i<=3; i++) {
aline = fgets(line, MAXLINELEN, lfp);
fprintf(left_paired, "%s", line);
}
// now process the right file
fprintf(right_paired, "%s", headerline);
right_paired_counter++;
for (int i=0; i<=2; i++) {
aline = fgets(line, MAXLINELEN, rfp);
fprintf(right_paired, "%s", line);
}
}
else {
fprintf(right_single, "%s", headerline);
right_single_counter++;
for (int i=0; i<=2; i++) {
aline = fgets(line, MAXLINELEN, rfp);
fprintf(right_single, "%s", line);
}
}
}
/* all that remains is to print the unprinted singles from the left file */
for (int i = 0; i < opt->tablesize; i++) {
struct idloc *ptr = ids[i];
while (ptr != NULL) {
if (! ptr->printed) {
fseek(lfp, ptr->pos, SEEK_SET);
left_single_counter++;
for (int n=0; n<=3; n++) {
aline = fgets(line, MAXLINELEN, lfp);
fprintf(left_single, "%s", line);
}
}
ptr = ptr->next;
}
}
fprintf(stderr, "Left paired: %d\t\tRight paired: %d\nLeft single: %d\t\tRight single: %d\n",
left_paired_counter, right_paired_counter, left_single_counter, right_single_counter);
fclose(lfp);
fclose(rfp);
/*
* Free up the memory for all the pointers
*/
for (int i = 0; i < opt->tablesize; i++) {
struct idloc *ptr = ids[i];
struct idloc *next;
while (ptr != NULL) {
next = ptr->next;
free(ptr);
ptr=next;
}
}
free(ids);
free(line);
return 0;
}
unsigned hash (char *s) {
unsigned hashval;
for (hashval=0; *s != '\0'; s++)
hashval = *s + 31 * hashval;
return hashval;
}
|