1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187
|
/*******************************************************************************************
*
* Synthetic DNA shotgun sequence generator
* Generate a fake genome of size genlen*1Mb long, that has an AT-bias of -b.
* The -r parameter seeds the random number generator for the generation of the genome
* so that one can reproducbile produce the same underlying genome to sample from. If
* missing, then the job id of the invocation seeds the generator. The sequence is
* sent to the standard output in .fasta format.
*
* Author: Gene Myers
* Date : April 2016
*
********************************************************************************************/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <math.h>
static char *Usage = "<genlen:double> [-U] [-b<double(.5)>] [-w<int(80)>] [-r<int>]";
static int GENOME; // -g option * 1Mbp
static double BIAS; // -b option
static int HASR = 0; // -r option is set?
static int SEED; // -r option
static int WIDTH; // -w option
static int UPPER; // -U option
static char *Prog_Name;
// Generate a random DNA sequence of length *len* with an AT-bias of BIAS.
// Uppercase letters if UPPER is set, lowercase otherwise.
static char *random_genome(int len)
{ static char *seq = NULL;
static double x, PRA, PRC, PRG;
int i;
if (seq == NULL)
{ PRA = BIAS/2.;
PRC = (1.-BIAS)/2. + PRA;
PRG = (1.-BIAS)/2. + PRC;
if ((seq = (char *) malloc(WIDTH+1)) == NULL)
{ fprintf(stderr,"%s: Allocating genome sequence\n",Prog_Name);
exit (1);
}
}
if (UPPER)
for (i = 0; i < len; i++)
{ x = drand48();
if (x < PRC)
if (x < PRA)
seq[i] = 'A';
else
seq[i] = 'C';
else
if (x < PRG)
seq[i] = 'G';
else
seq[i] = 'T';
}
else
for (i = 0; i < len; i++)
{ x = drand48();
if (x < PRC)
if (x < PRA)
seq[i] = 'a';
else
seq[i] = 'c';
else
if (x < PRG)
seq[i] = 'g';
else
seq[i] = 't';
}
seq[len] = '\0';
return (seq);
}
int main(int argc, char *argv[])
{ int i, j;
char *eptr;
double glen;
// Process command arguments
//
// Usage: <GenomeLen:double> [-b<double(.5)>] [-r<int>]
Prog_Name = strdup("rangen");
WIDTH = 80;
BIAS = .5;
HASR = 0;
UPPER = 0;
j = 1;
for (i = 1; i < argc; i++)
if (argv[i][0] == '-')
switch (argv[i][1])
{ default:
fprintf(stderr,"%s: %s is an illegal option\n",Prog_Name,argv[i]);
exit (1);
case 'U':
if (argv[i][2] != '\0')
{ fprintf(stderr,"%s: %s is an illegal option\n",Prog_Name,argv[i]);
exit (1);
}
UPPER = 1;
break;
case 'b':
BIAS = strtod(argv[i]+2,&eptr);
if (*eptr != '\0' || argv[i][2] == '\0')
{ fprintf(stderr,"%s: -%c '%s' argument is not a real number\n",
Prog_Name,argv[i][1],argv[i]+2);
exit (1);
}
if (BIAS < 0. || BIAS > 1.)
{ fprintf(stderr,"%s: AT-bias must be in [0,1] (%g)\n",Prog_Name,BIAS);
exit (1);
}
break;
case 'r':
SEED = strtol(argv[i]+2,&eptr,10);
HASR = 1;
if (*eptr != '\0' || argv[i][2] == '\0')
{ fprintf(stderr,"%s: -r argument is not an integer\n",Prog_Name);
exit (1);
}
break;
case 'w':
WIDTH = strtol(argv[i]+2,&eptr,10);
if (*eptr != '\0' || argv[i][2] == '\0')
{ fprintf(stderr,"%s: -w '%s' argument is not an integer\n",Prog_Name,argv[i]+2);
exit (1);
}
if (WIDTH < 0)
{ fprintf(stderr,"%s: Line width must be non-negative (%d)\n",Prog_Name,WIDTH);
exit (1);
}
break;
}
else
argv[j++] = argv[i];
argc = j;
if (argc != 2)
{ fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
fprintf(stderr,"\n");
fprintf(stderr," -r: Random number generator seed (default is process id).\n");
fprintf(stderr," -b: AT vs GC ratio or bias.\n");
fprintf(stderr," -w: Print -w bp per line (default is 80).\n");
fprintf(stderr," -U: Output sequence in upper case (default is lower case).\n");
exit (1);
}
glen = strtod(argv[1],&eptr);
if (*eptr != '\0')
{ fprintf(stderr,"%s: genome length is not a real number\n",Prog_Name);
exit (1);
}
if (glen < 0.)
{ fprintf(stderr,"%s: Genome length must be positive (%g)\n",Prog_Name,glen);
exit (1);
}
GENOME = (int) (glen*1000000.);
// Set up random number generator
if (HASR)
srand48(SEED);
else
srand48(getpid());
// Generate the sequence line at a time where all lines have width WDITH, save the last.
fprintf(stdout,">random len=%d bias=%g\n",GENOME,BIAS);
for (j = 0; j+WIDTH < GENOME; j += WIDTH)
fprintf(stdout,"%s\n",random_genome(WIDTH));
if (j < GENOME)
fprintf(stdout,"%s\n",random_genome(GENOME-j));
exit (0);
}
|