File: aaplbfct.cpp

package info (click to toggle)
swiftlang 6.0.3-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,519,992 kB
  • sloc: cpp: 9,107,863; ansic: 2,040,022; asm: 1,135,751; python: 296,500; objc: 82,456; f90: 60,502; lisp: 34,951; pascal: 19,946; sh: 18,133; perl: 7,482; ml: 4,937; javascript: 4,117; makefile: 3,840; awk: 3,535; xml: 914; fortran: 619; cs: 573; ruby: 573
file content (392 lines) | stat: -rw-r--r-- 12,888 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
/**
 *******************************************************************************
 * Copyright (C) 2007,2012 International Business Machines Corporation, Apple Inc.,*
 * and others.  All Rights Reserved.  
 *                                         *
 * originally added per rdar://4448220 Add user dictionary support
 *******************************************************************************
 */

#define __STDC_LIMIT_MACROS 1
#include <_foundation_unicode/utypes.h>

#if !UCONFIG_NO_BREAK_ITERATION && U_PLATFORM_IS_DARWIN_BASED

#include "brkeng.h"
#include "dictbe.h"
#include "aaplbfct.h"
#include <_foundation_unicode/uscript.h>
#include <_foundation_unicode/uniset.h>
#include <_foundation_unicode/ucnv.h>
#include <_foundation_unicode/uchar.h>
#include <limits.h>
#include <unistd.h>
#include <glob.h>
#include <strings.h>
#include <NSSystemDirectories.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <time.h>
#include <stdio.h>
#include <stdint.h>
// The following is now already included by platform.h (included indirectly by
// utypes.h) if U_PLATFORM_IS_DARWIN_BASED but it doesn't hurt to re-include here
#include <TargetConditionals.h>

U_NAMESPACE_BEGIN

/*
 ******************************************************************
 */

// Constructor: performs no work of its own; `status` is handed straight to the
// ICULanguageBreakFactory base-class constructor.
AppleLanguageBreakFactory::AppleLanguageBreakFactory(UErrorCode &status)
	: ICULanguageBreakFactory(status) {}

// Destructor: intentionally empty; no cleanup is performed at this level.
AppleLanguageBreakFactory::~AppleLanguageBreakFactory() {}

#if !TARGET_OS_EMBEDDED
#if 0
// need to update loadDictionaryMatcherFor implementation below

// Reads the next UChar from a length-delimited buffer as though the buffer were
// NUL-terminated: while `l` (remaining count) is positive, the character at `p` is
// returned and both `p` and `l` are advanced; once `l` is exhausted, 0 is returned
// and neither `p` nor `l` is touched.
static __attribute__((always_inline)) inline UChar nextUChar(const UChar *&p, ptrdiff_t &l) {
	if (l <= 0) {
		return 0;
	}
	--l;
	const UChar current = *p;
	++p;
	return current;
}

// Add a file's worth of words to the supplied mutable dictionary.
//
// The file at `path` is mapped read-only. If ucnv_detectUnicodeSignature() recognises a
// Unicode signature at its start, the contents are converted to UTF-16 with a converter for
// the detected encoding; otherwise the bytes are taken to be native-endian UTF-16 without a
// BOM. The text is then consumed one word per line, where the word is the leading run of
// non-whitespace characters and the rest of the line is ignored.
//
// Errors are not reported to the caller: on any failure (open, size, map, conversion, or
// addWord) the function simply stops and releases what it acquired.
static void addDictFile(MutableTrieDictionary *to, const char *path) {
	UErrorCode status = U_ZERO_ERROR;
	off_t fileLength = 0;
	const char *dictRawData = (const char *) -1;	// -1 doubles as "nothing mapped" (mmap's failure value)
	const UChar *dictData = NULL;
	ptrdiff_t dictDataLength = 0;
	UChar *dictBuffer = NULL;
	const char *encoding = NULL;
	int32_t		signatureLength = 0;
	
	// Open the dictionary file
	int dictFile = open(path, O_RDONLY, 0);
	if (dictFile == -1) {
		status = U_FILE_ACCESS_ERROR;
	}
	
	// Determine its length. The ICU calls below take the byte count as an int32_t, so a
	// file larger than INT32_MAX is rejected here instead of being silently truncated
	// (possibly to a negative length) when passed along.
	if (U_SUCCESS(status)) {
		fileLength = lseek(dictFile, 0, SEEK_END);
		(void) lseek(dictFile, 0, SEEK_SET);
		if (fileLength < 0 || fileLength > INT32_MAX) {
			status = U_FILE_ACCESS_ERROR;
		}
	}
	
	// Map it
	if (U_SUCCESS(status)) {
		dictRawData = (const char *) mmap(0, (size_t) fileLength, PROT_READ, MAP_SHARED, dictFile, 0);
		if ((intptr_t)dictRawData == -1) {
			status = U_FILE_ACCESS_ERROR;
		}
	}
	
	// No longer need the file descriptor open
	if (dictFile != -1) {
		(void) close(dictFile);
	}
	
	// Look for a Unicode signature
	if (U_SUCCESS(status)) {
		encoding = ucnv_detectUnicodeSignature(dictRawData, (int32_t) fileLength, &signatureLength, &status);
	}
	
	// If necessary, convert the data to UChars
	if (U_SUCCESS(status) && encoding != NULL) {
		UConverter *conv = ucnv_open(encoding, &status);
		// Preflight to get buffer size; the expected "overflow" from the zero-capacity
		// call is cleared so the real conversion can proceed
		uint32_t destCap = ucnv_toUChars(conv, NULL, 0, dictRawData, (int32_t) fileLength, &status);
		if (status == U_BUFFER_OVERFLOW_ERROR) {
			status = U_ZERO_ERROR;
		}
		if (U_SUCCESS(status)) {
			dictBuffer = new UChar[destCap+1];
		}
		(void) ucnv_toUChars(conv, dictBuffer, destCap+1, dictRawData, (int32_t) fileLength, &status);
		dictData = dictBuffer;
		dictDataLength = destCap;
		if (U_SUCCESS(status) && dictData[0] == 0xFEFF) {	// BOM? Skip it
			dictData += 1;
			dictDataLength -= 1;
		}
		
		ucnv_close(conv);
	}
	
	// If it didn't need converting, just assume it's native-endian UTF-16, no BOM
	if (U_SUCCESS(status) && dictData == NULL) {
		dictData = (const UChar *) dictRawData;
		dictDataLength = fileLength/sizeof(UChar);
	}
	
	// OK, we now have a pointer to native-endian UTF-16. Process it as one word per line,
	// stopping at the first space.
	if (U_SUCCESS(status)) {
		UnicodeSet breaks(UNICODE_STRING_SIMPLE("[[:lb=BK:][:lb=CR:][:lb=LF:][:lb=NL:]]"), status);
		const UChar *candidate = dictData;
		int32_t length = 0;
		UChar uc = nextUChar(dictData, dictDataLength);
		while (U_SUCCESS(status) && uc) {
			// Measure the word: everything up to the first whitespace (or end of data)
			while (uc && !u_isspace(uc)) {
				length += 1;
				uc = nextUChar(dictData, dictDataLength);
			}
			
			if (length > 0) {
				to->addWord(candidate, length, status);
			}
			
			// Find beginning of next line
			// 1. Skip non-line-break characters
			while (uc && !breaks.contains(uc)) {
				uc = nextUChar(dictData, dictDataLength);
			}
			// 2. Skip line break characters
			while (uc && breaks.contains(uc)) {
				uc = nextUChar(dictData, dictDataLength);
			}
			
			// Prepare for next line. dictData is one past `uc`, so back up by one to
			// point at the first character of the new line.
			candidate = dictData-1;
			length = 0;
		}
	}

	// Unmap the file if we mapped it
	if ((intptr_t) dictRawData != -1) {
		(void) munmap((void *)dictRawData, (size_t) fileLength);
	}
	
	// Delete any temporary buffer
	delete [] dictBuffer;
}

// Byte-order tag inserted into the user-dictionary cache file name built in
// loadDictionaryMatcherFor(): ".le" on little-endian builds, empty on big-endian ones.
#if !U_IS_BIG_ENDIAN
static const char sArchType[] = ".le";	// little endian
#else
static const char sArchType[] = "";
#endif

#endif
#endif

/*
In ICU 50, ICULanguageBreakFactory changed from
  virtual const CompactTrieDictionary *loadDictionaryFor(UScriptCode script, int32_t breakType);
to
  virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script, int32_t breakType);
and CompactTrieDictionary no longer exists. A new implementation of the user-dictionary
support below still needs to be worked out. (Note that the override below takes only the
script argument.)
*/

/**
 * Returns the dictionary matcher to use for `script`.
 *
 * As compiled today this only forwards to ICULanguageBreakFactory::loadDictionaryMatcherFor()
 * and returns its result. The user-dictionary support that follows is disabled with "#if 0"
 * because it is still written against CompactTrieDictionary / MutableTrieDictionary and has
 * not been ported to DictionaryMatcher.
 *
 * Outline of the disabled code, for whoever ports it:
 *   1. Collect every <Library>/Dictionaries directory (user, local and network domains).
 *   2. Glob "*-<scriptName>.txt" in those directories.
 *   3. Compare directory and file modification times against a per-user, per-script cache
 *      file in the local Caches directory; rebuild the cache (ICU dictionary plus user
 *      words) through a temp file and rename() when it is missing or stale.
 *   4. mmap the cache and return a dictionary built on it in place of the ICU one.
 * Any failure along the way falls back to returning the ICU matcher.
 *
 * @param script  Script whose dictionary is requested.
 * @return The matcher obtained from the base class (may be NULL if the base class returns NULL).
 */
DictionaryMatcher *
AppleLanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script) {
	DictionaryMatcher *icuDictMatcher = ICULanguageBreakFactory::loadDictionaryMatcherFor(script);
#if !TARGET_OS_EMBEDDED
#if 0
// need to update loadDictionaryMatcherFor implementation below
	// We only look for a user dictionary if there is actually an ICU dictionary
	if (icuDictMatcher != NULL) {
		UErrorCode status = U_ZERO_ERROR;
		const char *scriptName = uscript_getName(script);
		// NSGetNextSearchPathEnumeration() requires a PATH_MAX-sized buffer, so the path
		// buffers must not be smaller than that.
		char path[PATH_MAX];
		char cachePath[PATH_MAX];
		char cacheTargetPath[PATH_MAX];
		// Zero-initialized so gl_pathc/gl_pathv are well defined (and globfree() is
		// harmless) even when glob() never gets called on them.
		glob_t dirGlob = {};
		glob_t fileGlob = {};
		struct stat cacheStat;
		struct stat dictStat;
		bool cacheGood = true;
		int globFlags = (GLOB_NOESCAPE|GLOB_NOSORT|GLOB_TILDE);
		const CompactTrieDictionary *cacheDict = NULL;
		
		// Iterate the dictionary directories and accumulate in dirGlob
		NSSearchPathEnumerationState state = NSStartSearchPathEnumeration(NSLibraryDirectory, (NSSearchPathDomainMask) (NSUserDomainMask|NSLocalDomainMask|NSNetworkDomainMask));
		while ((state = NSGetNextSearchPathEnumeration(state, path)) != 0) {
			// First get the directory itself. We should never overflow, but use strlcat anyway
			// to avoid a crash if we do.
			strlcat(path, "/Dictionaries", sizeof(path));
			if (!glob(path, globFlags, NULL, &dirGlob)) {
				// Once one glob has succeeded, later results are appended to it
				globFlags |= GLOB_APPEND;
			}
		}
		
		// If there are no Dictionaries directories, ignore any cache file and return the ICU
		// standard dictionary
		// TODO: Delete the cache?
		if (dirGlob.gl_pathc == 0) {
			globfree(&dirGlob);
			return icuDictMatcher;
		}
		
		// See if there is a cache file already; get its mod time
		// TODO: should we be using geteuid() here instead of getuid()?
		state = NSStartSearchPathEnumeration(NSCachesDirectory, NSLocalDomainMask);
		state = NSGetNextSearchPathEnumeration(state, cachePath);	// Just use first one
		if (state == 0) {
			// Same convention as the loop above: a zero state means no path was returned,
			// so cachePath holds nothing usable. Fall back to the ICU dictionary.
			globfree(&dirGlob);
			return icuDictMatcher;
		}
		// Create the cache file name. We should never overflow, but use snprintf to avoid a crash
		// if we do.
		snprintf(cacheTargetPath, sizeof(cacheTargetPath), "%s/com.apple.ICUUserDictionaryCache%s.%s.%d", cachePath, sArchType, scriptName, getuid());
		if (stat(cacheTargetPath, &cacheStat) || cacheStat.st_mode != (S_IFREG|S_IRUSR|S_IWUSR)) {
			cacheGood = false;		// No file or bad permissions or type
		}
		
		// Stat the dictionary folders, and glob the dictionary files
		globFlags &= ~GLOB_APPEND;
		char **pathsp = dirGlob.gl_pathv;
		const char *dictpath;
		while ((dictpath = *pathsp++) != NULL) {
			// Stat the directory -- ignore if stat failure
			if (!stat(dictpath, &dictStat)) {
				// Glob the dictionaries in the directory
				snprintf(path, sizeof(path), "%s/*-%s.txt", dictpath, scriptName);
				if (!glob(path, globFlags, NULL, &fileGlob)) {
					globFlags |= GLOB_APPEND;
				}
				// If the directory has been modified after the cache file, we need to rebuild;
				// a dictionary might have been deleted.
				if (cacheGood && (dictStat.st_mtimespec.tv_sec > cacheStat.st_mtimespec.tv_sec || (dictStat.st_mtimespec.tv_sec == cacheStat.st_mtimespec.tv_sec && dictStat.st_mtimespec.tv_nsec > cacheStat.st_mtimespec.tv_nsec))) {
					cacheGood = false;
				}
			}
		}
		
		// No longer need the directory glob
		globfree(&dirGlob);
		
		// If there are no dictionaries, ignore the cache file and return the ICU dictionary
		// TODO: Delete the cache?
		if (fileGlob.gl_pathc == 0) {
			globfree(&fileGlob);
			return icuDictMatcher;
		}
		
		// Now compare the last modified stamp for the cache against all the dictionaries
		pathsp = fileGlob.gl_pathv;
		while (cacheGood && (dictpath = *pathsp++)) {
			// Stat the dictionary -- ignore if stat failure
			if (!stat(dictpath, &dictStat) && (dictStat.st_mtimespec.tv_sec > cacheStat.st_mtimespec.tv_sec || (dictStat.st_mtimespec.tv_sec == cacheStat.st_mtimespec.tv_sec && dictStat.st_mtimespec.tv_nsec > cacheStat.st_mtimespec.tv_nsec))) {
				cacheGood = false;
			}
		}
		
		// Do we need to build the dictionary cache?
		if (!cacheGood) {
			// Create a mutable dictionary from the ICU dictionary
			MutableTrieDictionary *sum = icuDictMatcher->cloneMutable(status);
			pathsp = fileGlob.gl_pathv;
			while (U_SUCCESS(status) && (dictpath = *pathsp++)) {
				// Add the contents of a file to the sum
				addDictFile(sum, dictpath);
			}
			
			// Create a compact (read-only) dictionary
			CompactTrieDictionary compact(*sum, status);
			delete sum;
			
			if (U_SUCCESS(status)) {
				// Open a temp file to write out the cache
				strlcat(cachePath, "/temp.XXXXXXXXXX", sizeof(cachePath));
				int temp = mkstemp(cachePath);
				if (temp == -1) {
					status = U_FILE_ACCESS_ERROR;
				}
				size_t dictSize = compact.dataSize();
				// A short write is treated the same as a failed one
				if (U_SUCCESS(status) && write(temp, compact.data(), dictSize) != (ssize_t) dictSize) {
					status = U_FILE_ACCESS_ERROR;
				}
				// Rename the temp file to the cache. Note that race conditions here are
				// fine, as the file system operations are atomic. If an outdated version wins
				// over a newer version, it will get rebuilt at the next app launch due to the
				// modification time checks above. We don't care that any given app launch gets
				// the most up-to-date cache (impossible since we can't lock all the Dictionaries
				// directories), only that the cache (eventually) reflects the current state of
				// any user dictionaries. That will happen on the next app launch after changes
				// to the user dictionaries quiesce.
				if (U_SUCCESS(status) && rename(cachePath, cacheTargetPath)) {
					status = U_FILE_ACCESS_ERROR;
				}
				if (temp != -1) {
					close(temp);
					if (U_FAILURE(status)) {
						// Clean up the temp file whether the write or the rename failed
						(void) unlink(cachePath);
					}
				}
			}
		}

		// Done with dictionary paths; release memory allocated by glob()
		globfree(&fileGlob);
		
		// Map the cache and build the dictionary
		if (U_SUCCESS(status)) {
			int cache = open(cacheTargetPath, O_RDONLY, 0);
			off_t length = 0;
			const void *cacheData = (const void *) -1;	// -1 == not mapped
			if (cache == -1) {
				status = U_FILE_ACCESS_ERROR;
			}
			if (U_SUCCESS(status)) {
				length = lseek(cache, 0, SEEK_END);
				(void) lseek(cache, 0, SEEK_SET);
				if (length < 0 || length > PTRDIFF_MAX) {
					status = U_FILE_ACCESS_ERROR;
				}
			}
			
			// Map the cache. Note: it is left mapped until process exit. This is the normal
			// behavior anyway, so it shouldn't be an issue.
			if (U_SUCCESS(status)) {
				cacheData = mmap(0, (size_t) length, PROT_READ, MAP_SHARED, cache, 0);
				if ((intptr_t)cacheData == -1) {
					status = U_FILE_ACCESS_ERROR;
				}
			}
			// We can close the cache file now that it's mapped (or not)
			if (cache != -1) {
				(void) close(cache);
			}
			// If all was successful, try to create the dictionary. The constructor will
			// check the magic number for us.
			if (U_SUCCESS(status)) {
				cacheDict = new CompactTrieDictionary(cacheData, status);
				if (cacheDict == NULL && U_SUCCESS(status)) {
					// Never hand back NULL in place of a working ICU dictionary
					status = U_MEMORY_ALLOCATION_ERROR;
				}
			}
			if (U_FAILURE(status) && (intptr_t)cacheData != -1) {
				// Clean up the mmap
				(void) munmap((void *)cacheData, (size_t) length);
			}
		}
		
		// If we were successful, free the ICU dictionary and return ours
		if (U_SUCCESS(status)) {
			delete icuDictMatcher;
			return cacheDict;
		}
		else {
			delete cacheDict;
		}
	}
#endif
#endif
	return icuDictMatcher;
}

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_BREAK_ITERATION && U_PLATFORM_IS_DARWIN_BASED */