1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193
|
/*
Please see the attachment for the sample program: it takes the distribution
from stdin and writes its output to stdout (with some information going to
stderr). Probabilities don't need to sum to 1. In the output, each article is
separated by an empty line, and each word occupies a single line.
The job would be to write code that, given a probability distribution
over words, (in the form:
0.022 foo
0.015 bar
0.001 baz
...
) would produce 60 documents of 200 words each, where the words would
be sampled from the given distribution.
*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <assert.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
/* Defaults for command-line arguments.  Each of these globals is
   overwritten by main() when the corresponding option is given. */
/* The number of documents to generate (option -n). */
int ndocs = 100;
/* The number of words per document (option -l). */
int nwords_per_doc = 20;
/* Prefix to each filename created (option -p); NULL means no prefix. */
const char *prefix = NULL;
/* Directory into which to place the documents (option -d). */
const char *dirname = NULL;
/* Number of distinct noise words (option -v); main() draws a noise-word
   index as rand() % noise_vocab_size. */
int noise_vocab_size = 0;
/* Noise fraction (option -f).  The usage text describes it as the
   probability of emitting a noise word instead of a word from the
   distribution; a value of 0 disables noise words entirely. */
float noise_vocab_fraction = 0;
/* Maximum number of words: the capacity of the word[] table below. */
#define MAX 99999
/* The distribution table filled in by main() from the distribution file.
   P holds the cumulative (running-sum) probability up to and including
   this entry; w points to a heap-allocated copy of the word. */
struct {
float P;
char *w;
}word[MAX];
/* Write the command-line synopsis and a short description of every
   option to stderr.  ARGV is the program's argument vector; only
   ARGV[0] (the program name) is read. */
void
print_usage (const char *argv[])
{
  const char *program = argv[0];

  /* Synopsis: the only part that needs formatting. */
  fprintf (stderr,
           "usage: %s [-d dirname] [-p prefix] [-l doclen] [-n ndocs]\n",
           program);

  /* Remainder of the synopsis and the fixed explanatory text. */
  fputs ("[-v noisevocabsize] [-f noisevocabfrac] distfile\n"
         " Will output NDOCS files each of length DOCLEN with filenames having \n"
         " PREFIX to directory DIRNAME.\n"
         " With probability NOISEVOCABFRAC, instead of picking a word from the\n"
         " distribution specified by DISTFILE, a word will be chosen uniformly \n"
         " from one of NOISEVOCABSIZE noise-words\n",
         stderr);
}
/* Return the value that follows the option at ARGV[*ARGI], advancing
   *ARGI past it.  Prints the usage message and exits with status -1
   when the option is the last argument and so has no value. */
static const char *
option_value (int argc, const char *argv[], int *argi)
{
  if (*argi + 1 >= argc)
    {
      fprintf (stderr, "%s: option `%s' requires an argument\n",
               argv[0], argv[*argi]);
      print_usage (argv);
      exit (-1);
    }
  *argi += 1;
  return argv[*argi];
}

/* Read "probability word" pairs from the file PATH into the global
   word[] table, storing the running sum of the probabilities in
   word[i].P and a heap copy of the word in word[i].w.  Returns the
   number of entries read.  Exits with status -1 if the file cannot be
   opened, memory runs out, or the file holds more than MAX entries. */
static int
load_distribution (const char *path)
{
  FILE *fp;
  float p;
  char token[256];
  int count = 0;

  fp = fopen (path, "r");
  if (fp == NULL)
    {
      fprintf (stderr, "Error opening distribution file `%s'\n", path);
      perror ("dice");
      exit (-1);
    }

  /* The field width keeps fscanf from writing past TOKEN
     (255 characters plus the terminating NUL). */
  while (fscanf (fp, "%f %255s", &p, token) == 2)
    {
      size_t size;

      /* Only complain once an entry that does not fit has actually
         been read, so a file with exactly MAX entries is accepted. */
      if (count >= MAX)
        {
          fprintf (stderr, "Error: number of words exceeds %d\n", MAX);
          exit (-1);
        }

      size = strlen (token) + 1;
      word[count].w = malloc (size);
      if (word[count].w == NULL)
        {
          fprintf (stderr, "Error: out of memory reading `%s'\n", path);
          exit (-1);
        }
      memcpy (word[count].w, token, size);
      word[count].P = (count == 0) ? p : word[count - 1].P + p;
      count++;
    }

  fclose (fp);
  return count;
}

/* Write one noise word to FP: the literal "noise" followed by the
   decimal digits of a random index in [0, noise_vocab_size), least
   significant digit first, each mapped to a letter 'a'..'j'.  Index 0
   therefore yields the bare word "noise". */
static void
write_noise_word (FILE *fp)
{
  int wn = rand () % noise_vocab_size;

  fputs ("noise", fp);
  while (wn)
    {
      fputc ('a' + wn % 10, fp);
      wn /= 10;
    }
  fputc ('\n', fp);
}

/* Write one word to FP, sampled from the first N entries of word[]
   in proportion to their probabilities.  N must be at least 1. */
static void
write_sampled_word (FILE *fp, int N)
{
  float r = rand () / (float) RAND_MAX * word[N - 1].P;
  int k = 0;

  /* Find the first entry whose cumulative probability reaches R; the
     bound on K keeps the scan inside the table even if the
     floating-point comparison never succeeds (e.g. NaN input). */
  while (k < N - 1 && word[k].P < r)
    k++;
  fprintf (fp, "%s\n", word[k].w);
}

/* Entry point.  Parses the options described by print_usage(), loads
   the word distribution from the file named by the first non-option
   argument, creates the output directory if needed (the current
   directory when -d is not given), and writes NDOCS files of
   NWORDS_PER_DOC words each, one word per line followed by a blank
   line.  Exits with status 0 on success and -1 on any error. */
int
main (int argc, const char *argv[])
{
  int argi, N, i;
  FILE *fp;
  char docname[1024];
  const char *distfile;
  int e;

  for (argi = 1; argi < argc; argi++)
    {
      if (argv[argi][0] != '-')
        break;
      switch (argv[argi][1])
        {
        case 'd':
          dirname = option_value (argc, argv, &argi);
          break;
        case 'p':
          prefix = option_value (argc, argv, &argi);
          break;
        case 'l':
          nwords_per_doc = atoi (option_value (argc, argv, &argi));
          break;
        case 'n':
          ndocs = atoi (option_value (argc, argv, &argi));
          break;
        case 'v':
          noise_vocab_size = atoi (option_value (argc, argv, &argi));
          break;
        case 'f':
          noise_vocab_fraction = atof (option_value (argc, argv, &argi));
          break;
        case '?':
        case 'h':
          print_usage (argv);
          exit (0);
        default:
          fprintf (stderr, "%s: unrecognized option `%s'\n",
                   argv[0], argv[argi]);
          print_usage (argv);
          exit (-1);
        }
    }

  /* The distribution file is the one mandatory argument. */
  if (argi >= argc)
    {
      fprintf (stderr, "%s: missing distribution file\n", argv[0]);
      print_usage (argv);
      exit (-1);
    }
  distfile = argv[argi];

  /* Noise words are numbered modulo noise_vocab_size, so a positive
     size is required whenever noise is enabled. */
  if (noise_vocab_fraction > 0 && noise_vocab_size <= 0)
    {
      fprintf (stderr, "%s: option -f requires a positive -v value\n",
               argv[0]);
      exit (-1);
    }

  /* -d is optional; fall back to the current directory rather than
     passing a null pointer to fprintf/mkdir/snprintf below. */
  if (dirname == NULL)
    dirname = ".";

  if (dirname[0] == '/')
    fprintf (stderr, "Output to %s\n", dirname);
  else
    fprintf (stderr, "Output to ./%s\n", dirname);

  /* read in prob. distribution */
  N = load_distribution (distfile);
  if (N == 0)
    {
      fprintf (stderr, "Error: no words read from `%s'\n", distfile);
      exit (-1);
    }
  fprintf (stderr, "Cumulative Prob.=%f\n", word[N - 1].P);

  /* Create the directory if necessary */
  e = mkdir (dirname, 0777);
  if (e != 0 && errno != EEXIST)
    {
      fprintf (stderr, "Error creating directory `%s'\n", dirname);
      perror ("dice");
      exit (-1);
    }

  /* generate documents */
  for (i = 0; i < ndocs; i++)
    {
      int j;
      int len;

      if (prefix)
        len = snprintf (docname, sizeof docname, "%s/%s%05d",
                        dirname, prefix, i);
      else
        len = snprintf (docname, sizeof docname, "%s/%05d", dirname, i);
      if (len < 0 || (size_t) len >= sizeof docname)
        {
          fprintf (stderr, "Error: document filename too long\n");
          exit (-1);
        }

      fp = fopen (docname, "w");
      if (fp == NULL)
        {
          fprintf (stderr, "Error creating file `%s'\n", docname);
          perror ("dice");
          exit (-1);
        }

      /* each with NWORDS_PER_DOC words */
      for (j = 0; j < nwords_per_doc; j++)
        {
          /* With probability noise_vocab_fraction emit a noise word,
             as documented in the usage text; rand() is only consumed
             for this decision when noise is enabled. */
          if (noise_vocab_fraction > 0
              && rand () / (float) RAND_MAX < noise_vocab_fraction)
            write_noise_word (fp);
          else
            write_sampled_word (fp, N);
        }
      fprintf (fp, "\n");

      /* fclose flushes buffered output, so a failure here means the
         document was not fully written. */
      if (fclose (fp) != 0)
        {
          fprintf (stderr, "Error writing file `%s'\n", docname);
          perror ("dice");
          exit (-1);
        }
    }
  exit (0);
}
|