File: utfcgr.h

package info (click to toggle)
fis-gtm 7.1-006-1
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 32,908 kB
  • sloc: ansic: 344,906; asm: 5,184; csh: 4,859; sh: 2,000; awk: 294; makefile: 73; sed: 13
file content (155 lines) | stat: -rw-r--r-- 8,165 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
/****************************************************************
 *								*
 * Copyright (c) 2015 Fidelity National Information 		*
 * Services, Inc. and/or its subsidiaries. All rights reserved.	*
 *								*
 *	This source code contains the intellectual property	*
 *	of its copyright holder(s), and is made available	*
 *	under a license.  If you do not know the terms of	*
 *	the license, please stop and do not read further.	*
 *								*
 ****************************************************************/

/* Header file for using UTF8 lookaside cache to aid in scanning of UTF8 strings. Similar
 * in concept to the fnpc.h lookaside cache for $PIECE() performance improvements except
 * here, we are logging mode changes between BADCHARs, UTF8 and ASCII character groups.
 */

#ifndef UTFCGR_INCLUDED
#define UTFCGR_INCLUDED

#include "utfcgr_trc.h"

/* Debugging counters. These exist only in DEBUG builds; the COUNT_UTF_* macros below are the intended way to
 * bump them so the counting disappears completely from non-DEBUG builds.
 */
#ifdef DEBUG
/* NOTE(review): process_id is not referenced anywhere in this header - presumably it is wanted by the code
 * that reports these statistics; confirm before removing.
 */
GBLREF	uint4	process_id;
GBLREF	int	u_miss;				/* UTF cache misses (debug) */
GBLREF	int	u_hit;				/* UTF cache hits (debug) */
GBLREF	int	u_small;			/* UTF scanned small string brute force (debug) */
GBLREF	int	u_pskip;			/* Number of UTF groups "skipped" (debug) */
GBLREF	int	u_puscan;			/* Number of groups "scanned" for located char (debug) */
GBLREF	int	u_pabscan;			/* Number of non-UTF groups we scan for located char (debug) */
GBLREF	int	u_parscan;			/* Number of partial scans (partial cache hits) (debug) */
GBLREF	int	u_parhscan;			/* Number of partial scans after filled slots (debug) */
/* COUNT_UTF_EVENT(X)   - increment counter u_<X> by one (X is the counter name without its "u_" prefix).
 * COUNT_UTF_INCR(X, Y) - add Y to counter u_<X>.
 * Caution: in DEBUG builds both macros expand to a complete statement INCLUDING its trailing semicolon (and
 * Y is not parenthesized) while in non-DEBUG builds they expand to nothing. An invocation followed by its own
 * semicolon therefore yields an extra empty statement under DEBUG so do not use one as the unbraced body of
 * an "if" that has an "else".
 */
#  define COUNT_UTF_EVENT(X)	++u_##X;
#  define COUNT_UTF_INCR(X, Y)	u_##X += Y;
#else
/* Non-DEBUG builds: counting compiles away to nothing */
#  define COUNT_UTF_EVENT(X)
#  define COUNT_UTF_INCR(X, Y)
#endif

/* Macro to determine the type and length of the character that starts at CPTR.
 * Arguments:
 *    CPTR    - pointer to first byte of character. May be any character pointer type - the lead byte is always
 *		read as an unsigned char so the comparison against ASCII_MAX does not depend on whether plain
 *		"char" is signed on this platform.
 *    CTOPPTR - pointer to last byte of string (max possible part of char). Only handed to UTF8_VALID(); it is
 *		not consulted when the lead byte is ASCII.
 *    CTYPE   - lvalue where determined type is stored (UTFCGR_ASCII, UTFCGR_UTF or UTFCGR_BADCHAR).
 *    CLEN    - lvalue that receives the length in bytes of the character (1 for ASCII, otherwise whatever
 *		UTF8_VALID() stores there).
 * Note CPTR is evaluated more than once (lead byte test plus UTF8_VALID()) so it must be free of side effects.
 */
#define UTF_CHARTYPELEN(CPTR, CTOPPTR, CTYPE, CLEN)						\
MBSTART {											\
	if (ASCII_MAX >= *(const unsigned char *)(CPTR))					\
	{	/* We have an ASCII char */							\
		(CTYPE) = UTFCGR_ASCII;								\
		(CLEN) = 1;									\
	} else											\
	{	/* We have a UTF8 or BADCHAR type char - bytelen is set appropriately */	\
		(CTYPE) = ((UTF8_VALID((CPTR), (CTOPPTR), CLEN)) ? UTFCGR_UTF : UTFCGR_BADCHAR);	\
	}											\
} MBEND

/* Define defaults and limits of the utfcgr structures. Note the defaults here were not chosen by any scientific
 * principle - they are our current best guess at what will work for the largest group of customers.
 */
#define GTM_UTFCGR_STRINGS_DEFAULT	 50	/* Default max number of strings to cache scan results for ($gtm_utfcgr_strings) */
#define GTM_UTFCGR_STRINGS_MAX		 254	/* Value is a single byte and 255 is used as "invalid value" flag */
#define GTM_UTFCGR_STRING_GROUPS_DEFAULT 32	/* Default max char groups cached per string ($gtm_utfcgr_string_groups) */
#define UTFCGR_STRLEN_MIN		 33	/* Minimum (byte) length string that creates a cache */
#define UTFCGR_MAX_UTF_LEN (UTFCGR_STRLEN_MIN * 2)	/* Maximum byte length of a UTF8 group in the cache. Capping the
							 * group length bounds the scan needed inside a group to at most
							 * this many bytes. Allows approximately UTFCGR_STRLEN_MIN UTF8
							 * chars averaging 2 bytes each.
							 */
#define UTFCGR_MAXLOOK_DIVISOR		 5	/* Value to divide into TREF(gtm_utfcgr_strings) to get the number of spins
						 * made looking for an available cache line (skipping slots with the
						 * reserve flag set) before we simply overwrite one. Used to compute
						 * TREF(utfcgr_string_lookmax).
						 */
/* Flags for utfcgr.entry[].typflags (an 8 bit field). If no flags are set, the group is of unknown type. Note
 * there is no specific purpose for making these actual bit flags - they could simply be values - but the thought
 * was that additional flags might be added in the future where it would matter.
 */
#define UTFCGR_NONE	0x00			/* Group has an as-yet undefined type */
#define UTFCGR_ASCII	0x01			/* Group is all ASCII */
#define UTFCGR_BADCHAR	0x02			/* Group is BADCHAR(s) */
#define UTFCGR_UTF	0x04			/* Group is all UTF8 (no BADCHARs) */
#define UTFCGR_EOL	0x08			/* Used in utfscan_parseblk->scan_char_type to indicate ran into EOL during scan */

/* Structure for each character group in a given string. The typflags field describes the group of characters that
 * starts after the end of the previous character group and runs until the start of the next character group. All
 * of the characters in a group are of the same type. The charcnt and byteidx fields describe the start (in
 * character count and byte index) of the *next* group of characters. So when parsing, entry[0] gives the flags
 * for group 0 but the count/offset at which group 1 (entry[1]) starts.
 */
typedef struct utfcgr_entry_struct
{
	uint4	typflags:8;			/* One byte of flag bits (UTFCGR_* values) for THIS group */
	uint4	charcnt:24;			/* Total count of characters at start of next group (24 bit field) */
	uint4	byteidx;			/* Offset in bytes from start of entire string to start of next group */
} utfcgr_entry;
/* Structure for each recently used string above the string length minimum. It describes the string and
 * notes where its transition points are between BADCHAR, UTF8 and ASCII. We refer to these entries as
 * "groups" but note they are different from $PIECE() type pieces which have a given delimiter. Here,
 * the "groups" are delimited not by characters but by character TYPE.
 *
 * Note entry[] is declared with a dimension of 1 but is really variable length so SIZEOF(utfcgr) covers
 * only the first entry - the real per-element size is kept in utfcgr_area.utfcgrsize.
 */
typedef struct utfcgr_struct
{
	mstr		last_str;		/* The last string (addr/len) we used in cache */
	unsigned short	ngrps;			/* Number of groups for which values are filled in */
	/* NOTE(review): the comment on idx below reads as if idx indexes entry[] - it may instead be the index of
	 * this utfcgr element within the cache array; confirm against the code that sets it.
	 */
	unsigned short	idx;			/* The index of this group in the entry[] array */
	boolean_t	reference;		/* Reference bit(s) to prevent overwrite if possible */
	utfcgr_entry	entry[1]; 		/* Table of char groups for this string. This is a variable dimension
						 * field - dimension is in TREF(gtm_utfcgr_string_groups).
						 */
} utfcgr;
/* Structure for the entire allocation for UTF scan cache. Because each utfcgr element is variable length (see
 * utfcgrsize), elements are located by address arithmetic rather than by normal array indexing.
 */
typedef struct
{
	utfcgr		*utfcgrsteal;		/* Last stolen cache element */
	utfcgr		*utfcgrmax;		/* (use addrs to avoid array indexing). NOTE(review): presumably the
						 * address marking the end of the utfcgrs array - confirm.
						 */
	utfcgr		*utfcgrs;		/* Ptr to variable dimension array which has TREF(gtm_utfcgr_strings) entries */
	uint4		utfcgrsize;		/* Size of 1 utfcgr entry (varies depending on TREF(gtm_utfcgr_string_groups)) */
} utfcgr_area;
/* This structure is the scan descriptor used by the UTF scanning/parsing routines. The caller fills in the
 * fields marked (input) before a scan and reads back the fields marked (output) afterwards; several fields
 * are both so that a subsequent scan can pick up where the previous one stopped.
 */
typedef struct
{
	mval		*mv;			/* Addr of mval this scan targets. Note this mval should be known to
						 * garbage collection so it is kept up-to-date across GC events (input but
						 * note mv may be updated with mv->utfcgr_indx field set).
						 */
	boolean_t	stoponbadchar;		/* TRUE - stops scan at badchar and returns with next two fields set
						 * FALSE - keeps scanning counting badchars as 1 byte (input).
						 * NOTE(review): "next two fields" presumably means badcharstr and
						 * badchartop (see their comments) rather than the two fields that
						 * physically follow this one - confirm.
						 */
	int		scan_byte_offset;	/* Byte offset (0 origin) where scan should start or where it ended (on return).
						 * If 0, starts/ended at beginning and next two fields are ignored. All 3 fields
						 * must be in sync or weird stuff can happen (input and output).
						 */
	int		scan_char_count;	/* Char count (1 origin) of the characters behind scan_byte_offset. Does NOT
						 * include the character at that offset.
						 */
	int		utfcgr_indx;		/* utfcgr_entry index (0 origin) where scan should start or where it ended on
						 * return (input and output).
						 */
	int		scan_char_len;		/* Byte length of the character whose offset/index we are returning (input and
						 * output but not updated when scan_char_type is UTFCGR_EOL).
						 */
	int		scan_char_type;		/* Character type (a UTFCGR_* value) of returned character position (output) */
	unsigned char	*badcharstr;		/* Location of badchar if found while scanning (output). Note this field and the
						 * next are only updated when scan_char_type is UTFCGR_BADCHAR and we return FALSE.
						 */
	unsigned char	*badchartop;		/* End of string to pass to utf8_badchar() (output) */
} utfscan_parseblk;

/* Entry point declarations (implementations live outside this header).
 *
 * utfcgr_getcache     - takes the mval of interest and returns a pointer to a utfcgr cache element.
 *			 NOTE(review): presumably this locates or assigns the cache line for mv's string - confirm.
 * utfcgr_scanforcharN - takes a character number plus a scan descriptor (see utfscan_parseblk for which fields
 *			 are input and which are output). Per the descriptor's field comments, a FALSE return
 *			 with scan_char_type of UTFCGR_BADCHAR means badcharstr/badchartop have been set.
 * utfcgr_stats        - DEBUG builds only. NOTE(review): presumably reports the u_* debug counters - confirm.
 */
utfcgr		*utfcgr_getcache(mval *mv);
boolean_t	utfcgr_scanforcharN(int char_num, utfscan_parseblk *utf_parse_blk);
#ifdef DEBUG
void		utfcgr_stats(void);
#endif

#endif