File: countpair.c

package info (click to toggle)
enca 1.21-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 4,948 kB
  • sloc: ansic: 10,297; sh: 5,858; xml: 2,132; makefile: 700; perl: 261
file content (121 lines) | stat: -rw-r--r-- 2,436 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#define NO_CHR 0xffff

#define FILL_CHARACTER '.'

/* One observed byte pair together with the number of times it was seen. */
struct pair_count {
  unsigned long n;    /* occurrences of the pair in the input */
  unsigned int code;  /* first_byte*0x100 + second_byte */
};

/* qsort() comparator: most frequent pair first.  Ties are broken by
 * ascending pair code, so the resulting order is fully determined even
 * though qsort() itself is not stable. */
static int
cmp_by_count_desc(const void *pa, const void *pb)
{
  const struct pair_count *a = pa;
  const struct pair_count *b = pb;

  if (a->n != b->n)
    return (a->n > b->n) ? -1 : 1;
  return (a->code > b->code) - (a->code < b->code);
}

/* qsort() comparator: group pairs by first byte (ascending).  Inside one
 * group the second bytes stay ordered by frequency, which is exactly what
 * the output wants. */
static int
cmp_by_first_byte(const void *pa, const void *pb)
{
  const struct pair_count *a = pa;
  const struct pair_count *b = pb;
  unsigned int first_a = a->code/0x100;
  unsigned int first_b = b->code/0x100;

  if (first_a != first_b)
    return (first_a > first_b) - (first_a < first_b);
  return cmp_by_count_desc(pa, pb);
}

/*
 * Count adjacent byte pairs on standard input and print the most frequent
 * ones, grouped by first byte.
 *
 * argv[1] names a text file of hexadecimal byte values (read with "%lx");
 * every listed byte counts as a letter.  Any other input byte, and the end
 * of input, is treated as FILL_CHARACTER.  The FILL_CHARACTER/FILL_CHARACTER
 * pair is never reported.
 *
 * Pairs are ranked by frequency and kept while their running total does not
 * exceed 95% of all counted pairs.  Each output line has the form
 * "<first>:<seconds...>", with the second bytes in decreasing frequency.
 *
 * Returns 0 on success, EXIT_FAILURE when the argument is missing, the
 * letter file cannot be opened, or it lists a value outside 0x00-0xff.
 */
int
main(int argc, char *argv[])
{
  /* Static storage: these tables are large for automatic variables, and
   * static objects start out zeroed, so no explicit clearing is needed. */
  static unsigned long count[0x10000];
  static struct pair_count pairs[0x10000];
  unsigned char letters[0x100] = { 0 };
  FILE *f;
  unsigned long j, sum, threshold, running;
  unsigned int code, prev, cur;
  size_t npairs, kept, i;
  int ch, at_eof;

  if (argc < 2) {
    fprintf(stderr, "Error: Missing arguments!\n");
    return EXIT_FAILURE;
  }

  /* read letters */
  f = fopen(argv[1], "r");
  if (f == NULL) {
    perror(argv[1]);
    return EXIT_FAILURE;
  }
  /* Parsing stops quietly at the first token that is not a hex number. */
  while (fscanf(f, "%lx", &j) == 1) {
    if (j >= 0x100) {
      /* Would index past the end of letters[]. */
      fprintf(stderr, "Error: Letter code 0x%lx out of range!\n", j);
      fclose(f);
      return EXIT_FAILURE;
    }
    letters[j] = 1;
  }
  fclose(f);

  /* read stuff */
  prev = FILL_CHARACTER;
  do {
    ch = getchar();
    at_eof = (ch == EOF);
    /* End of input is counted as one trailing fill character, so the last
     * real byte still forms a pair. */
    if (at_eof || !letters[ch])
      cur = FILL_CHARACTER;
    else
      cur = (unsigned int)ch;
    count[prev*0x100 + cur]++;
    prev = cur;
  } while (!at_eof);

  /* Collect the pairs that occurred into a separate table; compacting them
   * into count[] itself could overwrite entries not yet examined. */
  sum = 0;
  npairs = 0;
  for (code = 0; code < 0x10000; code++) {
    if (code/0x100 == FILL_CHARACTER && code%0x100 == FILL_CHARACTER)
      continue;
    if (count[code]) {
      pairs[npairs].n = count[code];
      pairs[npairs].code = code;
      sum += count[code];
      npairs++;
    }
  }

  /* sort by count */
  qsort(pairs, npairs, sizeof pairs[0], cmp_by_count_desc);

  /* kill small: keep the leading pairs while their running total stays
   * within 95% of all counted pairs; the pair that pushes the total past
   * that limit is dropped together with everything after it. */
  threshold = (unsigned long)(0.95*sum);
  running = 0;
  for (kept = 0; kept < npairs; kept++) {
    running += pairs[kept].n;
    if (running > threshold)
      break;
  }

  /* sort by first again */
  qsort(pairs, kept, sizeof pairs[0], cmp_by_first_byte);

  /* One line per first byte: "<first>:<second bytes>". */
  i = 0;
  while (i < kept) {
    unsigned int first = pairs[i].code/0x100;

    printf("%c:", (int)first);
    while (i < kept && pairs[i].code/0x100 == first) {
      printf("%c", (int)(pairs[i].code%0x100));
      i++;
    }
    printf("\n");
  }
  return 0;
}
/* vim: ts=2
 */