File: gtm_utf8.c

package info (click to toggle)
fis-gtm 6.3-007-1
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 36,284 kB
  • sloc: ansic: 328,861; asm: 5,182; csh: 5,102; sh: 1,918; awk: 291; makefile: 69; sed: 13
file content (415 lines) | stat: -rwxr-xr-x 14,261 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
/****************************************************************
 *								*
 * Copyright (c) 2006-2018 Fidelity National Information	*
 * Services, Inc. and/or its subsidiaries. All rights reserved.	*
 *								*
 *	This source code contains the intellectual property	*
 *	of its copyright holder(s), and is made available	*
 *	under a license.  If you do not know the terms of	*
 *	the license, please stop and do not read further.	*
 *								*
 ****************************************************************/

#include "mdef.h"

#include <stdarg.h>
#include "gtm_string.h"
#include "gtm_stdlib.h"

#include "error.h"
#include "util.h"
#include "gtm_icu_api.h"
#include "gtm_utf8.h"
#include "cmd_qlf.h"

GBLREF command_qualifier	cmd_qlf;
GBLREF	boolean_t		badchar_inhibit;
GBLREF	boolean_t		gtm_utf8_mode;
GBLREF	void			(*stx_error_fptr)(int in_error, ...);	/* Function pointer for stx_error() so gtm_utf8.c can avoid
									 * pulling in stx_error() in gtmsecshr.
									 */
GBLREF	void			(*show_source_line_fptr)(boolean_t warn); /* Func ptr for show_source_line() same reason as above */

error_def(ERR_BADCHAR);

/* Return UTF8 length of mstr string in UTF8 characters */
int utf8_len(mstr* str)
{
	return utf8_len_real(err_rts, str);
}

/* This is the same as "utf8_len" except that it invokes UTF8_BADCHAR_STX macro which does a stx_error instead of rts_error
 * when an invalid UTF8 character is detected in the string (and badchar_inhibit is not enabled).
 * If UTF8_BADCHAR_STX is invoked, this function returns a -1 signalling a parse error.
 */
int utf8_len_stx(mstr* str)
{
	return utf8_len_real(err_stx, str);
}

/* This is the same as "utf8_len" except that it invokes UTF8_BADCHAR_DEC macro which does a dec_err instead of rts_error.
 * Note only one "error" is raised for any given string and we return the length as best we can with the broken string.
 */
int utf8_len_dec(mstr* str)
{
	return utf8_len_real(err_dec, str);
}

/* The routine that does the actual work of determining the length and responding appropriately in the event an invalid
 * UTF8 character is detected.
 */
STATICFNDEF int utf8_len_real(utf8_err_type err_type, mstr* str)
{
	int		charlen, bytelen;
	char		*ptrtop, *ptr;
	boolean_t	err_raised;

	assert(gtm_utf8_mode);
	ptr = str->addr;
	ptrtop = ptr + str->len;
	charlen = 0;
	err_raised = FALSE;
	if (!badchar_inhibit)
	{
		for (; ptr < ptrtop; charlen++, ptr += bytelen)
		{
			if (!UTF8_VALID(ptr, ptrtop, bytelen))
			{
				switch(err_type)
				{
					case err_rts:
						UTF8_BADCHAR(0, ptr, ptrtop, 0, NULL);
						break;			/* Never get here but keeps compiler happy */
					case err_stx:
						UTF8_BADCHAR_STX(0, ptr, ptrtop, 0, NULL);
						return -1;
					case err_dec:
						if (!err_raised)
						{
							UTF8_BADCHAR_DEC(0, ptr, ptrtop, 0, NULL);
							err_raised = TRUE;
						}
						bytelen = 1;		/* Assume only one char is broken */
						break;
					default:
						assertpro(FALSE /* Invalid error type */);
				}
			}
		}
	} else
	{
		for (; ptr < ptrtop; charlen++)
			ptr = (char *)UTF8_MBNEXT(ptr, ptrtop);
	}
	assert(ptr == ptrtop);
	str->char_len = charlen;
	return charlen;
}

/* Similar to utf8_len() except it operates on a given string instead of an mval and does not observe badchar_inhibit.
 * String must be valid or error is raised.
 */
int utf8_len_strict(unsigned char* ptr, int len)
{
	int		charlen, bytelen;
	unsigned char	*ptrtop;

	ptrtop = ptr + len;
	for (charlen = 0; ptr < ptrtop; charlen++, ptr += bytelen)
	{
		if (!UTF8_VALID(ptr, ptrtop, bytelen))
			UTF8_BADCHAR(0, ptr, ptrtop, 0, NULL);
	}
	assert(ptr == ptrtop);
	return charlen;
}

/* Returns the total display column width of a UTF-8 string given its address and byte length.
 * The third parameter (strict) is used to specify how both illegal characters should be handled.
 * The fourth parameter (nonprintwidth) is to specify what width to give for unprintable characters.
 *	It is currently 0 if coming through $ZWIDTH and 1 if coming through util_output (for historical reasons).
 * If strict is TRUE, this routine
 * 	- triggers BADCHAR error if it encounters any illegal characters irrespective of VIEW BADCHAR setting.
 * If strict is FALSE, this routine
 * 	- does NOT do BADCHAR check.
 *	- treats illegal characters as unprintable characters (for width).
 */
int gtm_wcswidth(unsigned char* ptr, int len, boolean_t strict, int nonprintwidth)
{
	int		strwidth, cwidth;
	uint4		ch;
	unsigned char	*ptrtop, *ptrnext;

	assert(gtm_utf8_mode);
	ptrtop = ptr + len;
	for (strwidth = 0; ptr < ptrtop; ptr = ptrnext)
	{
		ptrnext = UTF8_MBTOWC(ptr, ptrtop, ch);
		if (WEOF != ch && -1 != (cwidth = UTF8_WCWIDTH(ch)))
			strwidth += cwidth;
		else if (strict && (WEOF == ch))
			UTF8_BADCHAR(0, ptr, ptrtop, 0, NULL);
		else
			strwidth += nonprintwidth;
	}
	assert(ptr == ptrtop);
	return strwidth;
}

/* Returns the display column width of a character given its code point. This function
 * returns -1 for control characters and 0 for non-spacing (combining) characters.
 *
 * NOTEs:
 * We are not using libc's wcwidth() due to its inconsistent behavior across different
 * platforms and its incorrect behavior for several characters (even on Linux).
 *
 * ICU does not provide a direct API for display width, however, it does provide API
 * for the property "East Asian Width" specified in UAX#11 (http://unicode.org/reports/tr11/)
 * which provides guidelines to determine the width for the entire repertoire.
 *
 * Using "East Asian Width" and "General Category" properties. gtm_wcwidth() determines
 * the column width as below:
 * 	- SOFT-HYPHEN is a special format control character with a width of 1.
 *	- Non-spacing combining marks and Enclosing combining marks (general
 *	  category codes 'Mn' and 'Me') have a column width of 0. Note that Combing spacing
 *	  marks (General Category 'Mc') occupy a column width of 1.
 *	- Conjoining Hangul Jamos (i.e. vowels and trailing consonants between U+1160 -
 *	  U+11FF) have a column with of 0. They are like the combining marks in that they
 *	  attach to their previous characters (although they categorized as letters).
 *	- All wide characters (East Asian Width - Wide (W) and Full-Width (F)) have a
 *	  column width of 2 and all narrow characters (East Asian Width - Narrow (Na)
 *	  and Half-Width (H)) have a column width of 1.
 *	- All characters (with East Asian Width - Neutral (N) and Ambiguous (A)) have a
 *	  column width of 1.
 * 	- All other non-printable (control characters) and unassigned code points (empty blocks)
 * 	  have a width -1.
 */
int gtm_wcwidth(wint_t code)
{
	UCharCategory		gc;	/* General category as defined by the standard */
	UEastAsianWidth		ea;
	UHangulSyllableType	hst;

	assert(gtm_utf8_mode);
	if (0x00ad == code) /* SOFT-HYPHEN, a special format control character */
		return 1;
	gc = (UCharCategory)u_getIntPropertyValue((UChar32)code, UCHAR_GENERAL_CATEGORY);
	if (U_NON_SPACING_MARK == gc || U_ENCLOSING_MARK == gc || /* combining marks (Mn, Me) */
		U_FORMAT_CHAR == gc || /* all other format control (Cf) characters */
		U_HST_VOWEL_JAMO == (hst = (UHangulSyllableType)u_getIntPropertyValue((UChar32)code,
			UCHAR_HANGUL_SYLLABLE_TYPE)) ||
		U_HST_TRAILING_JAMO == hst) /* conjoining hangul jamos (in Korean) */
	{
		return 0;
	}
	if (U_ISPRINT((UChar32)code))
	{
		ea = (UEastAsianWidth)u_getIntPropertyValue((UChar32)code, UCHAR_EAST_ASIAN_WIDTH);
		return (U_EA_FULLWIDTH == ea || U_EA_WIDE == ea) ? 2 : 1;
	}
	return -1;
}

/* This function issues a BADCHAR error and prints the sequences of bytes that comprise the bad multi-byte character.
 * If "len" is 0, the function determines how many bytes this multi-byte character is comprised of and prints all of it.
 * If "len" is non-zero, the function prints "len" number of bytes from "str" in the error message.
 */
void utf8_badchar(int len, unsigned char *str, unsigned char *strtop, int chset_len, unsigned char *chset)
{
	DCL_THREADGBL_ACCESS;
	SETUP_THREADGBL_ACCESS;

	assert(!TREF(compile_time));
	utf8_badchar_real(err_rts, len, str, strtop, chset_len, chset);
	return;
}

/* This function is the same as "utf8_badchar" except that it does a "stx_error" instead of "rts_error". This helps
 * to identify the line in the M program that has the compile time error.
 */
void utf8_badchar_stx(int len, unsigned char *str, unsigned char *strtop, int chset_len, unsigned char *chset)
{
	DCL_THREADGBL_ACCESS;
	SETUP_THREADGBL_ACCESS;

	assert(!TREF(compile_time));
	utf8_badchar_real(err_stx, len, str, strtop, chset_len, chset);
	return;
}

/* This function is the same as "utf8_badchar" except that it does a "dec_err()" instead of "rts_error". This helps
 * to identify the line in the M program that has the compile time error but unlike stx_error(), it does not remove
 * the relevant generated code and replace it with a runtime OC_RTERROR triple. This is because the runtime code does
 * the same level of checking as the compiler does so can detect the error "normally" on its own - no need for the
 * complication with trying to cut out the right set of triples. We can just put out the compiler message and let it
 * be.
 */
void utf8_badchar_dec(int len, unsigned char *str, unsigned char *strtop, int chset_len, unsigned char *chset)
{
	utf8_badchar_real(err_dec, len, str, strtop, chset_len, chset);
	return;
}

/* This function issues a BADCHAR error and prints the sequences of bytes that comprise the bad multi-byte character.
 * If "len" is 0, the function determines how many bytes this multi-byte character is comprised of and prints all of it.
 * If "len" is non-zero, the function prints "len" number of bytes from "str" in the error message.
 * This is the work-horse routine for the 3 above variants of utf8_badchar*(). The differences are in how the error
 * is delivered and what happens afterwards. For the 3 options:
 *
 *    err_rts    - uses rts_error() to raise the error
 *    err_stx    - uses stx_error to raise the error
 *    err_dec	 - uses dec_err to raise the error
 */
STATICFNDEF void utf8_badchar_real(utf8_err_type err_type, int len, unsigned char *str, unsigned char *strtop, int chset_len,
				   unsigned char *chset)
{
	unsigned char 	*strptr, *strend, *outstr;
	unsigned char	errtxt[OUT_BUFF_SIZE];
	int		tmplen;

	DCL_THREADGBL_ACCESS;
	SETUP_THREADGBL_ACCESS;

	assert(gtm_utf8_mode);
	if (0 == len)
	{	/* Determine the maximal length (upto 4 bytes) of the invalid byte sequence */
		for (strend = str; len <= 4 && strend < strtop; ++strend, ++len)
		{
			if (UTF8_VALID(strend, strtop, tmplen))
				break;
		}
	} else
		strend = str + len;
	strptr = str;
	outstr = &errtxt[0];
	for (; strptr < strend; ++strptr, ++outstr)
	{
		outstr = (unsigned char*)i2asc((uchar_ptr_t)outstr, *strptr);
		*outstr = ',';
	}
	if (0 < len)		/* do not include the last comma */
		outstr--;
	if (err_dec == err_type)
	{
		assert(NULL != show_source_line_fptr);
		if (!(TREF(compile_time) && !(cmd_qlf.qlf & CQ_WARNINGS)))
			(*show_source_line_fptr)(TRUE);	/* Print errant src line and pointer to where parsing detected the error */
	}
	if (0 < chset_len)
	{
		switch(err_type)
		{
			case err_rts:
				rts_error_csa(CSA_ARG(NULL) VARLSTCNT(6) ERR_BADCHAR, 4, (outstr - &errtxt[0]), &errtxt[0],
					      chset_len, chset);
				break;		/* Never get here but keeps compiler happy */
			case err_stx:
				assert(NULL != stx_error_fptr);
				(*stx_error_fptr)(ERR_BADCHAR, 4, (outstr - &errtxt[0]), &errtxt[0], chset_len, chset);
				break;
			case err_dec:
				if (!(TREF(compile_time) && !(cmd_qlf.qlf & CQ_WARNINGS)))
					dec_err(VARLSTCNT(6) (TREF(compile_time)
						? MAKE_MSG_TYPE(ERR_BADCHAR, WARNING) : ERR_BADCHAR),
						4, (outstr - &errtxt[0]), &errtxt[0], chset_len, chset);
				break;
			default:
				assertpro(FALSE /* Invalid error type */);
		}
	} else
	{

		switch(err_type)
		{
			case err_rts:
				rts_error_csa(CSA_ARG(NULL) VARLSTCNT(6) ERR_BADCHAR, 4, (outstr - &errtxt[0]), &errtxt[0],
					      LEN_AND_LIT(UTF8_NAME));
				break;		/* Never get here but keeps compiler happy */
			case err_stx:
				assert(NULL != stx_error_fptr);
				(*stx_error_fptr)(ERR_BADCHAR, 4, (outstr - &errtxt[0]), &errtxt[0], LEN_AND_LIT(UTF8_NAME));
				break;
			case err_dec:
				if (!(TREF(compile_time) && !(cmd_qlf.qlf & CQ_WARNINGS)))
					dec_err(VARLSTCNT(6) (TREF(compile_time)
					? MAKE_MSG_TYPE(ERR_BADCHAR, WARNING): ERR_BADCHAR),
					4, (outstr - &errtxt[0]), &errtxt[0], LEN_AND_LIT(UTF8_NAME));
				break;
			default:
				assertpro(FALSE /* Invalid error type */);
		}
	}
}

/* This function scans the string from the beginning and stops the moment it finds an invalid byte sequence.
 * It null-terminates the string from then onwards until the end.
 */
unsigned char *gtm_utf8_trim_invalid_tail(unsigned char *str, int len)
{
	unsigned char	*ptrend, *ptr;
	int		bytelen;

	ptr = str;
	ptrend = str + len;
	while (ptr < ptrend)
	{
		if (UTF8_VALID(ptr, ptrend, bytelen))
			ptr += bytelen;
		else
			break;
	}
	for ( ; ptr < ptrend; ptr++)
		*ptr = '\0';
	return ptr;
}

/* Remove the trailing line terminator from buffer.
 * buffer	line in ICU UChar format
 * len		length of line as number of UChar characters
 * 			as given by u_strlen()
 * Returns	number of characters after removing line terminator
 */
int trim_U16_line_term(UChar *buffer, int len)
{
	int	lt_index;
	UChar32	uc32_cp;

	if (0 == len)
		return 0;		/* zero length string */
	U16_GET(buffer, 0, len - 1, len, uc32_cp);
	for (lt_index = 0; u32_line_term[lt_index]; lt_index++)
		if (uc32_cp == u32_line_term[lt_index])
			break;
	if ((U32_LT_LF == lt_index) && (1 < len))
	{
		U16_GET(buffer, 0, len - 2, len, uc32_cp);
		if (u32_line_term[U32_LT_CR] == uc32_cp)
			len--;		/* trim both CR and LF */
	}
	if (U32_LT_LAST >= lt_index)
	{
		buffer[len - 1] = 0;
		return (len - 1);
	}
	return len;		/* no line terminator so return it all */
}

boolean_t valid_utf_string(const mstr *str)
{
	int		charlen, bytelen;
	char		*ptrtop, *ptr;

	ptr = str->addr;
	ptrtop = ptr + str->len;
	charlen = 0;

	for (; ptr < ptrtop; charlen++, ptr += bytelen)
	{
		if (!UTF8_VALID(ptr, ptrtop, bytelen))
		{	/* Emit a warning if there is an issue*/
			UTF8_BADCHAR_DEC(0, ptr, ptrtop, 0, NULL);
			return FALSE;
		}
	}
	return TRUE;
}