1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298
|
/*******************************************************************************
* Copyright (c) 2006, 2008 IBM Corporation and others.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* Contributors:
* IBM Corporation - initial API and implementation
*******************************************************************************/
package org.eclipse.osgi.util;
import java.util.Locale;
/**
 * Prepares strings that carry structural meaning (file paths, URLs, qualified
 * names and the like) for display in right-to-left locales, so that the
 * Unicode BiDi algorithm does not visually reorder their parts.
 * <p>
 * The input is viewed as a sequence of segments separated by a caller
 * supplied set of delimiter characters. Directional formatting characters
 * (LRM, LRE and PDF) are inserted so that the segments keep their
 * left-to-right order while the text inside each segment is still laid out by
 * the BiDi algorithm.
 * </p>
 * <p>
 * Processing is only active when the default locale language is one of
 * <code>iw</code>, <code>he</code>, <code>ar</code>, <code>fa</code> or
 * <code>ur</code> and the <code>os.name</code> system property starts with
 * <code>windows</code>, <code>linux</code> or <code>mac</code> (compared in
 * lower case). This is decided once, when the class is initialized. In every
 * other environment the methods of this class return their argument
 * unchanged.
 * </p>
 *
 * @since 3.2
 * @noextend This class is not intended to be subclassed by clients.
 */
public class TextProcessor {

    // ---- delimiters used when the caller does not supply any ----

    /** Dot (.), typical for package names and file extensions. */
    private static final String DOT = "."; //$NON-NLS-1$

    /** Colon (:), typical for drive letters and URLs. */
    private static final String COLON = ":"; //$NON-NLS-1$

    /** Forward slash (/), typical for file paths and URLs. */
    private static final String FILE_SEP_FSLASH = "/"; //$NON-NLS-1$

    /** Backslash (\), typical for file paths. */
    private static final String FILE_SEP_BSLASH = "\\"; //$NON-NLS-1$

    /** All default delimiter characters, concatenated into one string. */
    private static final String delimiterString = DOT + COLON + FILE_SEP_FSLASH + FILE_SEP_BSLASH;

    // ---- directional formatting characters ----

    /** LEFT-TO-RIGHT MARK. */
    private static final char LRM = '\u200e';

    /** LEFT-TO-RIGHT EMBEDDING. */
    private static final char LRE = '\u202a';

    /** POP DIRECTIONAL FORMATTING. */
    private static final char PDF = '\u202c';

    /**
     * Sentinel meaning "no delimiter is currently waiting for an LRM to be
     * placed in front of it".
     */
    private static final int INDEX_NOT_SET = 999999999;

    /** Whether this runtime needs any processing at all; fixed at class load. */
    private static final boolean IS_PROCESSING_NEEDED = detectProcessingNeeded();

    /*
     * Decides whether processing is needed: the default locale must use one
     * of the listed languages and os.name must start with one of the listed
     * platform prefixes. The os.name property is only read when the language
     * check has passed.
     */
    private static boolean detectProcessingNeeded() {
        String language = Locale.getDefault().getLanguage();
        String[] bidiLanguages = {"iw", "he", "ar", "fa", "ur"}; //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$
        boolean bidiLanguage = false;
        for (int i = 0; i < bidiLanguages.length && !bidiLanguage; i++) {
            bidiLanguage = bidiLanguages[i].equals(language);
        }
        if (!bidiLanguage) {
            return false;
        }

        String platform = System.getProperty("os.name").toLowerCase(); //$NON-NLS-1$
        String[] supportedPlatforms = {"windows", "linux", "mac"}; //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
        for (int i = 0; i < supportedPlatforms.length; i++) {
            if (platform.startsWith(supportedPlatforms[i])) {
                return true;
            }
        }
        return false;
    }

    /**
     * Processes the given text using the default delimiters. Equivalent to
     * <code>process(text, getDefaultDelimiters())</code>.
     *
     * @param text
     *            the text to be processed; <code>null</code> and strings of
     *            length 0 or 1 are returned as they are
     * @return the processed string
     * @see #process(String, String)
     * @see #getDefaultDelimiters()
     */
    public static String process(String text) {
        boolean nothingToDo = !IS_PROCESSING_NEEDED || text == null || text.length() <= 1;
        if (nothingToDo) {
            return text;
        }
        return process(text, getDefaultDelimiters());
    }

    /**
     * Processes a string with structural meaning so that, as a whole, it is
     * laid out left to right, while each segment between delimiters is still
     * subject to the Unicode BiDi algorithm.
     * <p>
     * For example the path <code>d:\myFolder\FOLDER\MYFILE.java</code> (capital
     * letters standing for RTL text) is meant to render as
     * <code>d:\myFolder\REDLOF\ELIFYM.java</code>.
     * </p>
     * <p>
     * How the result is built:
     * </p>
     * <ol>
     * <li>The characters are scanned once, from first to last.</li>
     * <li>When a delimiter is met while the most recent strong character was
     * an RTL letter, its position is remembered. If several such delimiters
     * follow one another, only the position of the last one is kept.</li>
     * <li>When the next digit or RTL letter is met, an LRM is inserted in
     * front of the remembered delimiter. An LTR letter simply discards the
     * remembered position, since no mark is required in that case.</li>
     * <li>The result is enclosed in LRE ... PDF if the string contains an RTL
     * letter, or does not start with a letter, or ends with a character that
     * is neither a letter nor a digit. Otherwise the original string is
     * returned unchanged.</li>
     * </ol>
     * <p>
     * A string that already starts with LRE and ends with PDF is considered
     * processed and is returned as it is.
     * </p>
     * <p>
     * NOTE: the returned string may contain extra formatting characters, so
     * <code>String.equals(String)</code> and <code>String.length()</code> on
     * it do not give the same answers as on the original.
     * </p>
     *
     * @param str
     *            the text to process; <code>null</code> and strings of length
     *            0 or 1 are returned as they are
     * @param delimiter
     *            the characters that separate segments; if <code>null</code>
     *            the default delimiters are used
     * @return the processed string
     */
    public static String process(String str, String delimiter) {
        if (!IS_PROCESSING_NEEDED || str == null || str.length() <= 1) {
            return str;
        }

        final int length = str.length();
        final char firstChar = str.charAt(0);
        final char lastChar = str.charAt(length - 1);

        // Already wrapped by an earlier call: leave it alone.
        if (firstChar == LRE && lastChar == PDF) {
            return str;
        }

        final String separators = (delimiter != null) ? delimiter : getDefaultDelimiters();

        boolean sawRtlLetter = false; // the string holds at least one RTL letter
        boolean lastStrongIsRtl = false; // direction of the latest strong character
        int pendingMarkIndex = INDEX_NOT_SET; // where an LRM may have to go

        StringBuffer out = new StringBuffer();
        out.append(LRE);

        for (int pos = 0; pos < length; pos++) {
            final char current = str.charAt(pos);

            if (separators.indexOf(current) >= 0) {
                // A delimiter following RTL text is a candidate for an LRM;
                // remember where it is about to land in the buffer.
                if (lastStrongIsRtl) {
                    pendingMarkIndex = out.length();
                }
            } else if (Character.isDigit(current)) {
                // A digit after the remembered delimiter: place the LRM in
                // front of that delimiter.
                if (pendingMarkIndex != INDEX_NOT_SET) {
                    out.insert(pendingMarkIndex, LRM);
                    pendingMarkIndex = INDEX_NOT_SET;
                    lastStrongIsRtl = false;
                }
            } else if (Character.isLetter(current)) {
                final boolean rtl = isRTL(current);
                if (rtl) {
                    sawRtlLetter = true;
                    // RTL letter after the remembered delimiter: place the
                    // LRM in front of that delimiter.
                    if (pendingMarkIndex != INDEX_NOT_SET) {
                        out.insert(pendingMarkIndex, LRM);
                    }
                }
                // Any letter settles the pending delimiter one way or the
                // other and becomes the latest strong character.
                pendingMarkIndex = INDEX_NOT_SET;
                lastStrongIsRtl = rtl;
            }

            out.append(current);
        }

        /*
         * The orientation of the widget that will show the string is not
         * known here. The LRE ... PDF wrapping is therefore kept when
         * (1) the string contains an RTL letter, or
         * (2) it does not start with a letter, or
         * (3) it ends with a character that is neither a letter nor a digit.
         * In all other cases the untouched original is handed back.
         */
        final boolean keepEmbedding = sawRtlLetter || !Character.isLetter(firstChar) || isNeutral(lastChar);
        if (!keepEmbedding) {
            return str;
        }
        out.append(PDF);
        return out.toString();
    }

    /**
     * Strips the directional formatting characters (LRE, PDF and LRM) from the
     * given string, undoing what <code>process(String)</code> or
     * <code>process(String, String)</code> may have inserted. Every occurrence
     * of these three characters is removed, wherever it appears.
     * <p>
     * The string is returned unchanged if it is <code>null</code>, shorter
     * than two characters, or if processing is not active in this runtime.
     * </p>
     *
     * @param str string with directional markers to remove
     * @return string with no directional markers
     * @see #process(String)
     * @see #process(String, String)
     * @since 3.3
     */
    public static String deprocess(String str) {
        if (!IS_PROCESSING_NEEDED || str == null || str.length() <= 1) {
            return str;
        }

        final int length = str.length();
        StringBuffer stripped = new StringBuffer();
        for (int pos = 0; pos < length; pos++) {
            final char current = str.charAt(pos);
            if (!isDirectionalMark(current)) {
                stripped.append(current);
            }
        }
        return stripped.toString();
    }

    /**
     * Returns the delimiter characters used to segment a string when the
     * caller does not provide its own: dot, colon, forward slash and
     * backslash.
     *
     * @return delimiter string
     */
    public static String getDefaultDelimiters() {
        return delimiterString;
    }

    /*
     * Tells whether the character is one of the three formatting characters
     * this class inserts (LRE, PDF, LRM).
     */
    private static boolean isDirectionalMark(char c) {
        return c == LRE || c == PDF || c == LRM;
    }

    /*
     * Tells whether the character is treated as right-to-left.
     *
     * Character.getDirectionality() is deliberately not used: the OSGi
     * library can be compiled against execution environments that pre-date
     * that API. Two hard-coded ranges are tested instead: U+05D0..U+07B1
     * (Hebrew and Arabic characters) and U+FB1D..U+FEFC (Hebrew and Arabic
     * presentation forms). Farsi and Urdu are written in Arabic script.
     */
    private static boolean isRTL(char c) {
        final boolean inScriptRange = c >= 0x05d0 && c <= 0x07b1;
        final boolean inPresentationForms = c >= 0xfb1d && c <= 0xfefc;
        return inScriptRange || inPresentationForms;
    }

    /*
     * Tells whether the character is neither a digit nor a letter.
     */
    private static boolean isNeutral(char c) {
        return !Character.isDigit(c) && !Character.isLetter(c);
    }

    /*
     * Not instantiable: the class only offers static utilities.
     */
    private TextProcessor() {
        // prevent instantiation
    }
}
|