1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166
|
/* This program counts trigraphs in all the dictionaries specified
and outputs a C++ include file of constants for gpw.C to use.
Change to a C include file by changing "const" to "static".
Output of this program needs postprocessing to eliminate comma closebrace;
see the makefile for gpw.
THVV 6/94 Coded
*/
#include "stdio.h"
#include "stdlib.h"
int tris[26][26][26]; /* Trigraph frequencies */
int duos[26][26]; /* Bigraph frequencies */
int sing[26]; /* Letter frequencies */
long max = 0; /* largest trigraph count */
int m1, m2, m3; /* coords of largest count */
long sigma = 0; /* Grand total of all trigraph counts */
FILE *fp;

/*
 * Prune trigraphs that lead to a dead end.
 *
 * A letter pair [c1][c2] is a dead end when no trigraph starts with it,
 * i.e. tris[c1][c2][*] are all zero.  For every dead-end pair:
 *   - the bigraph count duos[c1][c2] is cleared;
 *   - every trigraph ending in that pair, tris[*][c1][c2], is cleared;
 *   - the cleared counts are subtracted from sigma.
 *
 * Clearing tris[c3][c1][c2] may leave the pair [c3][c1] without any
 * trigraph, and that pair may already have been examined earlier in the
 * scan.  A single pass therefore is not enough: the scan is repeated until
 * a complete pass removes nothing.  Each repeated pass has cleared at least
 * one non-zero cell, so the loop terminates.
 *
 * max and m1..m3 are not adjusted here.
 */
void checktris(void) {
    int changed;

    do {
        int c1, c2, c3;

        changed = 0;
        for (c1 = 0; c1 < 26; c1++) {
            for (c2 = 0; c2 < 26; c2++) {
                int hastris = 0;

                for (c3 = 0; c3 < 26; c3++) {
                    if (tris[c1][c2][c3]) {
                        hastris = 1;
                        break;
                    }
                }
                if (hastris) {
                    continue;
                }

                /* [c1][c2] has no tris: do not use the duo ... */
                duos[c1][c2] = 0;
                /* ... and do not use tris which end on [c1][c2]. */
                for (c3 = 0; c3 < 26; c3++) {
                    if (tris[c3][c1][c2]) {
                        sigma -= tris[c3][c1][c2];
                        tris[c3][c1][c2] = 0;
                        changed = 1; /* [c3][c1] may have become a dead end */
                    }
                }
            }
        }
    } while (changed);
}
int main (int argc, char ** argv) {
char buf[100];
int j;
int k1, k2, k3;
int c1, c2, c3;
char s1[2], s2[2], s3[2];
int argno, nfiles;
for (c1=0; c1 < 26; c1++) { /* Initialize arrays to zero */
sing[c1] = 0;
for (c2=0; c2 < 26; c2++) {
duos[c1][c2] = 0;
for (c3=0; c3 < 26; c3++) {
tris[c1][c2][c3] = 0;
}
}
}
s1[1] = '\0';
s2[1] = '\0';
s3[1] = '\0';
nfiles = 0; /* count of files read */
if (argc < 2) {
printf (" USAGE: loadtris /usr/dict/words ...");
exit (1);
}
for (argno = 1; argno < argc; argno++) {
if ((fp = fopen (argv[argno], "r")) == NULL) {
printf ("** file %s not found\n", argv[argno]);
break;
}
nfiles++;
while (fgets (buf, sizeof (buf), fp)) {
j = 0; /* j indexes the input */
k2 = -1; /* k1, k2 are coords of previous letter */
k1 = -1;
while (buf[j]) { /* until we find the null char.. */
k3 = buf[j]; /* Pick out a letter from the input */
if (k3 > 'Z') {
k3 = k3 - 'a'; /* map from a-z to 0-25 */
}
else {
k3 = k3 - 'A'; /* map from A-Z to 0-25 */
}
if (k3 >= 0 && k3 <= 25) { /* valid subscript? */
if (k1 >= 0) { /* do we have 3 letters? */
tris[k1][k2][k3]++; /* count */
sigma++; /* grand total */
if (tris[k1][k2][k3] > max) {
max = tris[k1][k2][k3];
m1 = k1; /* note largest cell.. */
m2 = k2; /* .. for interest */
m3 = k3;
}
}
if (k2 >= 0) {
duos[k2][k3]++; /* count 2-letter pairs */
}
sing[k3]++; /* count single letter frequency */
k1 = k2; /* shift over */
k2 = k3;
}
j++;
} /* while buf[j] */
} /* while fgets */
fclose (fp);
} /* for argno */
if (nfiles) { /* find any input? */
checktris();
printf ("/* BEGIN INCLUDE FILE .. trigram.h */\n"); /* Multics style */
printf ("\n");
printf ("const long sigma = %ld;\n", sigma);
/* (for my /usr/dict/words it is 125729, fits in a long) */
/* For interest print out the most frequent entry. */
/* (for my /usr/dict/words it is 863 = ATE, showing that a short works OK) */
s1[0] = m1 + 'a';
s2[0] = m2 + 'a';
s3[0] = m3 + 'a';
printf ("const short maxcell = %ld; /* %s%s%s */\n", max, s1, s2, s3);
printf ("const short sing[26] = {");
for (c1=0; c1 < 26; c1++) {
printf ("%d, ", sing[c1]);
}
printf ("};\n"); /* oops, ends in comma closebrace */
printf ("const short duos[26][26] = {");
for (c1=0; c1 < 26; c1++) {
s1[0] = c1+'A';
printf ("\n{/* %s */ ", s1);
for (c2=0; c2 < 26; c2++) {
printf ("%d, ", duos[c1][c2]);
}
printf("}");
if (c1!=25) printf(",");
}
printf ("};\n"); /* oops, ends in comma closebrace */
printf ("const short tris[26][26][26] = {");
for (c1=0; c1 < 26; c1++) {
printf("{");
for (c2=0; c2 < 26; c2++) {
s1[0] = c1+'A';
s2[0] = c2+'A';
printf ("\n{/* %s %s */ ", s1, s2);
for (c3=0; c3 < 26; c3++) {
printf ("%d, ", tris[c1][c2][c3]);
}
printf("}");
if (c2!=25) printf(",");
}
printf("}");
if (c1!=25) printf(",");
}
printf ("};\n"); /* comma closebrace again, fix later */
printf ("/* END INCLUDE FILE .. trigram.h */\n");
}
exit (0);
}
|