File: unihist.c

package info (click to toggle)
uniutils 2.28-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 6,032 kB
  • sloc: ansic: 49,905; sh: 1,168; awk: 55; makefile: 18
file content (219 lines) | stat: -rw-r--r-- 6,639 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
/* Time-stamp: <2008-04-03 19:48:48 poser> */
/*
 * Copyright (C) 2005-2008 William J. Poser (billposer@alum.mit.edu)
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 3 the GNU General Public License
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
 * or go to the web page:  http://www.gnu.org/licenses/gpl.txt.
 */

#include "config.h"
#include <errno.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#ifdef HAVE_LOCALE_H
#include <locale.h>
#endif
#ifdef HAVE_LIBINTL_H
#include <libintl.h>
#else
#define gettext(x) (x)
#endif
#include "unicode.h"
#include "utf8error.h"

/*
 * CHARSETSIZE is the number of slots in the per-character count table and
 * the upper bound of the loops that walk it.  A BMPONLY build keeps the
 * table small by covering only 16-bit code values.
 */
#ifdef BMPONLY
#define CHARSETSIZE 65536 	/* 16 bits */
#else
#define CHARSETSIZE 2097152	/* Full Unicode = 21 bits */
#endif

/* Size in bytes of the shared message buffer below. */
#define MSGSIZE 128

/* Program name used in diagnostics and in the version banner. */
char pgname[]="unihist";
/* Build date and time, printed by ShowVersion. */
char compdate[]=__DATE__ " " __TIME__ ;

/* Scratch buffer in which HandleReadError formats the text it hands to perror. */
char msg [MSGSIZE];

/*
 * Occurrence count for each code value, indexed by the code value itself.
 * Having static storage duration, it starts out all zero.
 */
static long CharacterCounts[CHARSETSIZE];

/*
 * Write the one-line program description followed by a summary of the
 * command line flags to standard error.
 */
void
ShowUsage(void){
  FILE *out = stderr;

  /* None of these messages takes arguments, so they are written verbatim. */
  fputs(gettext("Filter to generate a histogram of the Unicode characters in the input.\n"),
	out);
  fputs(gettext("       -c Suppress printing of counts and percentages.\n"),
	out);
  fputs(gettext("       -g Suppress printing of glyphs.\n"),
	out);
  fputs(gettext("       -u Suppress printing of code as text.\n"),
	out);
  fputs(gettext("       -h Print this help message.\n"),
	out);
  fputs(gettext("       -v Print version.\n"),
	out);
}

/*
 * Write the version banner to standard error: program name and package
 * version, a note when the build is restricted to the BMP, the build
 * timestamp, the copyright line, the licence and the bug report address.
 */
void
ShowVersion(void){
  FILE *out = stderr;

  fprintf(out,"\n%s  %s\n",pgname,PACKAGE_VERSION);
#ifdef BMPONLY
  fputs(gettext("This version compiled to handle only the BMP.\n"),out);
#endif
  fprintf(out,gettext("Compiled %s.\n"),compdate);
  /* The copyright line is deliberately not passed through gettext. */
  fputs("Copyright 2005-2008 William J. Poser\n",out);
  fputs(gettext("Released under the terms of the GNU General Public License, version 3.\n"),out);
  fputs(gettext("Report bugs to: billposer@alum.mit.edu.\n"),out);
}

/*
 * Report why reading stopped and terminate the program on a real error.
 *
 *   c        value that ended the read loop
 *   rp       raw bytes of the current sequence; only used (passed to
 *            ExplicateBadUTF8) when c is UTF8_BADINCODE
 *   LineCnt, CharCnt, ByteCnt
 *            position counters echoed in the diagnostic
 *
 * For each of the four UTF8_* error codes a diagnostic is written to
 * standard error and the process exits with status 1.  Any other value is
 * taken to mean normal end of input and 0 is returned.
 */
int
HandleReadError(UTF32 c, unsigned char *rp, long LineCnt, long CharCnt, long ByteCnt)
{

  extern void ExplicateBadUTF8(FILE *, unsigned char *);
  int saved_errno = errno;	/* Capture before any library call can disturb it */
  int len;

  switch (c)
    { 
    case UTF8_NOTENOUGHBYTES:
      fprintf(stderr,gettext("Truncated UTF-8 sequence encountered at line %ld, character %ld, byte %ld.\n"),LineCnt, CharCnt, ByteCnt);
      exit(1);
      break;
    case UTF8_BADINCODE:
      fprintf(stderr,gettext("Invalid UTF-8 code encountered at line %ld, character %ld, byte %ld.\n"),LineCnt, CharCnt, ByteCnt);
      ExplicateBadUTF8(stderr,rp);
      exit(1);
      break;
    case UTF8_BADOUTCODE:
      fprintf(stderr,gettext("Encountered invalid Unicode at line %ld, character %ld, byte %ld.\n"),LineCnt, CharCnt, ByteCnt);
      exit(1);
      break;
    case UTF8_IOERROR:
      /* snprintf takes the full buffer size and always NUL-terminates. */
      len = snprintf(msg,MSGSIZE,gettext("Error reading input at line %ld, character %ld, byte %ld.\n"),LineCnt,CharCnt,ByteCnt);
      if (len < 0) {
	msg[0] = '\0';		/* Formatting failed; let perror print the bare error text */
      }
      else {
	if (len >= MSGSIZE) len = MSGSIZE - 1;	/* Output was truncated */
	/*
	 * perror appends ": <error text>\n" itself, so a trailing newline
	 * in the prefix would split the report across two lines.
	 */
	if ((len > 0) && (msg[len-1] == '\n')) msg[len-1] = '\0';
      }
      errno = saved_errno;	/* Make perror describe the read failure */
      perror(msg);
      exit(1);
      break;
    default:			/* Normal EOF */
      break;
    }
  return(0);
}

/*
 * Predicate: returns 1 when a glyph should be printed for code value c,
 * 0 when c falls in one of the ranges treated as having nothing to show.
 */
short DisplayableP(wchar_t c){
  int hidden =
       (c <= 0x20)			/* ASCII control characters and space */
    || ((c >= 0x7F) && (c <= 0xA0))	/* ASCII DEL and Unicode control characters */
    || ((c >= 0x2000) && (c <= 0x200F))	/* Various spaces and direction codes */
    || (c == 0x3000)			/* Ideographic space */
    || (c == 0xFEFF);			/* Zero width no break space */

  return hidden ? 0 : 1;
}

int
main(int ac, char **av)
{
   wchar_t c;			/* Input character as UTF-32 */
   long i;			/* Loop variable */
   int infd;			/* Input file descriptor */
   int oc;			/* Getopt option flag  */
   long TotalChars;		/* Total number of characters in input */
   long ByteOffset = 0L;	/* Offset from beginning of input in bytes, counting from zero */
   long LineNumber = 0L;
   int UCBytes;			/* Bytes of input occupied by the current input character */
   unsigned char *rawptr;
   short CntP    =1;		/* Output counts and percentages? */
   short CodeP  = 1;		/* Output UTF-32 codes in ascii hex?*/
   short GlyphP = 1;		/* Output UTF-8 glyphs? */


#ifdef HAVE_NUMBER_GROUP_SEPARATOR
   char *fmtstr = "\t%7.3f\t%'8ld";
#else
   char *fmtstr = "\t%7.3f\t%8ld";
#endif

   extern int optopt;
   extern int opterr;
   extern UTF32 Get_UTF32_From_UTF8 (int,int *,unsigned char **);
   extern void putu8 (wchar_t);

   opterr = 0;			/* We'll handle errors ourselves */
   while( (oc = getopt(ac,av,"cghuv")) != EOF){
     switch(oc){
     case 'c':
       CntP = 0;
       break;
     case 'g':
       GlyphP = 0;
       break;
     case 'h':
       ShowUsage();
       exit(1);
     case 'u':
       CodeP = 0;
       break;
     case 'v':
       ShowVersion();
       exit(1);
     default:
       fprintf(stderr,gettext("%s: invalid option flag %c\n"),pgname,optopt);
       ShowVersion();
       ShowUsage();
       exit(2);
     }
   }

#ifdef HAVE_SETLOCALE
   setlocale(LC_ALL,"");
#endif
#ifdef HAVE_LIBINTL_H
   bindtextdomain (PACKAGE, LOCALEDIR);
   textdomain (PACKAGE);
#endif

   infd = fileno(stdin);

   /* Initialize counts */
   for(i=0L;i<CHARSETSIZE;i++) CharacterCounts[i] = 0L;
   TotalChars = 0L;
   
   /* Count */
   while ( (c = Get_UTF32_From_UTF8(infd,&UCBytes,&rawptr)) <= UNI_MAX_UTF32){  
     ByteOffset += UCBytes;
     if (c == 0x000A) LineNumber++;
#ifdef BMPONLY
     if(c > CHARSETSIZE){
       fprintf(stderr,
	       gettext("Encountered input outside the BMP (plane 0): 0x%06X at byte offset %ld.\n"),
	       c,ByteOffset-UCBytes);
       fprintf(stderr,
	       gettext("This version of %s has been compiled to handle only characters in the BMP.\n"),pgname);
       fprintf(stderr,
	       gettext("Recompile with BMPONLY undefined to handle all of Unicode.\n"));
       exit(3);
     }
#endif
     CharacterCounts[c]+=1L;
     ++TotalChars;
   }
   (void)HandleReadError(c,rawptr,LineNumber,TotalChars,ByteOffset);

   /* Output */
   for(i=0L;i < CHARSETSIZE; i++){
     if(CharacterCounts[i] == 0) continue;
     if(CntP) printf(fmtstr,100.0 * (((double)CharacterCounts[i])/(double)TotalChars),CharacterCounts[i]);
     if(CodeP) printf("\t0x%06lX",i);
     if(GlyphP && DisplayableP(i)){ putchar('\t');putu8(i);}
     putchar('\n');
   }

   exit(0);
}