File: TextProcessor.java

package info (click to toggle)
libequinox-osgi-java 3.9.1-6
links: PTS, VCS
area: main
in suites: bookworm, forky, sid, trixie
size: 5,068 kB
sloc: java: 57,768; makefile: 9
file content (298 lines) | stat: -rw-r--r-- 10,379 bytes
parent folder | download | duplicates (7)
/*******************************************************************************
 * Copyright (c) 2006, 2008 IBM Corporation and others.
 * All rights reserved. This program and the accompanying materials 
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 * 
 * Contributors:
 *     IBM Corporation - initial API and implementation
 *******************************************************************************/
package org.eclipse.osgi.util;

import java.util.Locale;

/**
 * This class is used to process strings that have special semantic meaning
 * (such as file paths) in RTL-oriented locales so that they render in a way
 * that does not corrupt the semantic meaning of the string but also maintains
 * compliance with the Unicode BiDi algorithm of rendering Bidirectional text.
 * <p>
 * Processing of the string is done by breaking it down into segments that are
 * specified by a set of user provided delimiters. Directional punctuation
 * characters are injected into the string in order to ensure the string retains
 * its semantic meaning and conforms with the Unicode BiDi algorithm within each
 * segment.
 * </p>
 * 
 * @since 3.2
 * @noextend This class is not intended to be subclassed by clients.
 */
public class TextProcessor {

	// commonly used delimiters
	/**
	 * Dot (.) delimiter. Used most often in package names and file extensions.
	 */
	private static final String DOT = "."; //$NON-NLS-1$

	/**
	 * Colon (:) delimiter. Used most often in file paths and URLs.
	 */
	private static final String COLON = ":"; //$NON-NLS-1$

	/**
	 * Forward slash (/) delimiter. Used most often in file paths and URLs.
	 */
	private static final String FILE_SEP_FSLASH = "/"; //$NON-NLS-1$

	/**
	 * Backslash (\) delimiter. Used most often in file paths.
	 */
	private static final String FILE_SEP_BSLASH = "\\"; //$NON-NLS-1$

	/**
	 * The default set of delimiters to use to segment a string.
	 */
	private static final String delimiterString = DOT + COLON + FILE_SEP_FSLASH + FILE_SEP_BSLASH;

	// left to right marker
	private static final char LRM = '\u200e';

	// left to right embedding
	private static final char LRE = '\u202a';

	// pop directional format
	private static final char PDF = '\u202c';

	// sentinel meaning that no delimiter is currently waiting for an LRM to be
	// inserted in front of it
	private static final int INDEX_NOT_SET = 999999999;

	// whether or not processing is needed; decided once, when the class is loaded
	private static final boolean IS_PROCESSING_NEEDED = computeProcessingNeeded();

	/*
	 * Decide whether strings have to be processed at all: only when the default
	 * locale uses one of the handled BiDi languages (Hebrew, Arabic, Farsi, Urdu)
	 * and the "os.name" system property starts with windows, linux or mac.
	 */
	private static boolean computeProcessingNeeded() {
		String lang = Locale.getDefault().getLanguage();
		if (!("iw".equals(lang) || "he".equals(lang) || "ar".equals(lang) || "fa".equals(lang) || "ur".equals(lang))) //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$
			return false;

		String osName = System.getProperty("os.name"); //$NON-NLS-1$
		if (osName == null) {
			// a missing property must not make class initialization fail;
			// treat the platform as one that needs no processing
			return false;
		}
		// the OS name is an identifier, not user text: lower-case it with a
		// fixed locale so the comparison does not depend on the default locale
		osName = osName.toLowerCase(Locale.ENGLISH);
		return osName.startsWith("windows") || osName.startsWith("linux") || osName.startsWith("mac"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
	}

	/**
	 * Process the given text and return a string with the appropriate
	 * substitution based on the locale. This is equivalent to calling
	 * <code>process(String, String)</code> with the default set of
	 * delimiters.
	 * 
	 * @param text
	 *            the text to be processed
	 * @return the manipulated string
	 * @see #process(String, String)
	 * @see #getDefaultDelimiters()
	 */
	public static String process(String text) {
		if (!IS_PROCESSING_NEEDED || text == null || text.length() <= 1)
			return text;
		return process(text, getDefaultDelimiters());
	}

	/**
	 * Process a string that has a particular semantic meaning to render on BiDi
	 * locales in a way that maintains the semantic meaning of the text, but
	 * differs from the Unicode BiDi algorithm. The text is segmented according
	 * to the provided delimiters. Each segment has the Unicode BiDi algorithm
	 * applied to it, but as a whole, the string is oriented left to right.
	 * <p>
	 * For example a file path such as <tt>d:\myFolder\FOLDER\MYFILE.java</tt>
	 * (where capital letters indicate RTL text) should render as
	 * <tt>d:\myFolder\REDLOF\ELIFYM.java</tt> when using the Unicode BiDi
	 * algorithm and segmenting the string according to the specified delimiter
	 * set.
	 * </p>
	 * <p>
	 * The following algorithm is used:
	 * <ol>
	 * <li>Scan the string to locate the delimiters.</li>
	 * <li>While scanning, note the direction of the last strong character
	 * scanned. Strong characters are characters which have a BiDi
	 * classification of L, R or AL as defined in the Unicode standard.</li>
	 * <li>If the last strong character before a separator is of class R or AL,
	 * add a LRM before the separator. Since LRM itself is a strong L character,
	 * following separators do not need an LRM until a strong R or AL character
	 * is found.</li>
	 * <li>If the component where the pattern is displayed has a RTL basic
	 * direction, add a LRE at the beginning of the pattern and a PDF at its
	 * end. The string is considered to have RTL direction if it contains RTL
	 * characters and the runtime locale is BiDi. There is no need to add
	 * LRE/PDF if the string begins with an LTR letter, contains no RTL letter,
	 * and ends with either a LTR letter or a digit.</li>
	 * </ol>
	 * </p>
	 * <p>
	 * NOTE: this method will change the shape of the original string passed in
	 * by inserting punctuation characters into the text in order to make it
	 * render to correctly reflect the semantic meaning of the text. Methods
	 * like <code>String.equals(String)</code> and
	 * <code>String.length()</code> called on the resulting string will not
	 * return the same values as would be returned for the original string.
	 * </p>
	 * <p>
	 * The string is returned unchanged when processing is not needed for the
	 * runtime locale/platform, when it is shorter than two characters, or when
	 * it already starts with LRE and ends with PDF.
	 * </p>
	 * 
	 * @param str
	 *            the text to process, if <code>null</code> return the string
	 *            as it was passed in
	 * @param delimiter
	 *            delimiters by which the string will be segmented, if
	 *            <code>null</code> the default delimiters are used
	 * @return the processed string
	 */
	public static String process(String str, String delimiter) {
		if (!IS_PROCESSING_NEEDED || str == null || str.length() <= 1)
			return str;

		// do not process a string that has already been processed.
		if (str.charAt(0) == LRE && str.charAt(str.length() - 1) == PDF)
			return str;

		String delimiters = delimiter == null ? getDefaultDelimiters() : delimiter;

		// String contains RTL characters
		boolean isStringBidi = false;
		// Last strong character is RTL
		boolean isLastRTL = false;
		// Buffer index of the last delimiter that followed RTL text and may
		// still need an LRM in front of it
		int delimIndex = INDEX_NOT_SET;

		StringBuffer target = new StringBuffer();
		target.append(LRE);

		for (int i = 0, n = str.length(); i < n; i++) {
			char ch = str.charAt(i);
			if (delimiters.indexOf(ch) != -1) {
				// character is a delimiter: remember where it lands in the
				// buffer, but only when the preceding strong character is RTL
				if (isLastRTL)
					delimIndex = target.length();
			} else if (Character.isDigit(ch)) {
				if (delimIndex != INDEX_NOT_SET) {
					// delimiter after RTL text followed by a digit: put an LRM
					// in front of the delimiter to force it left to right
					target.insert(delimIndex, LRM);
					delimIndex = INDEX_NOT_SET;
					// the inserted LRM is now the last strong character
					isLastRTL = false;
				}
			} else if (Character.isLetter(ch)) {
				if (isRTL(ch)) {
					isStringBidi = true;
					if (delimIndex != INDEX_NOT_SET) {
						// delimiter sits between two RTL runs: put an LRM in
						// front of it to force it left to right
						target.insert(delimIndex, LRM);
						delimIndex = INDEX_NOT_SET;
					}
					isLastRTL = true;
				} else {
					// strong LTR character, no LRM will be required
					delimIndex = INDEX_NOT_SET;
					isLastRTL = false;
				}
			}
			target.append(ch);
		}

		/*
		 * TextProcessor is not aware of the orientation of the component owning
		 * the processed string. Enclose the string in LRE/PDF in either of 2
		 * cases: 
		 * (1) The string contains RTL characters - implying that the string
		 * appearance depends on the basic orientation 
		 * (2) The string does not start with a letter, or it ends with a
		 * character that is neither a letter nor a digit.
		 */
		char first = str.charAt(0);
		char last = str.charAt(str.length() - 1);
		if (isStringBidi || !Character.isLetter(first) || isNeutral(last)) {
			target.append(PDF);
			return target.toString();
		}
		// Otherwise, return the original string
		return str;
	}

	/**
	 * Removes directional marker characters in the given string that were inserted by 
	 * utilizing the <code>process(String)</code> or <code>process(String, String)</code>
	 * methods.
	 * <p>
	 * Every LRE, PDF and LRM character found in the string is dropped. The
	 * string is returned unchanged when processing is not needed for the
	 * runtime locale/platform, or when it is <code>null</code> or shorter than
	 * two characters.
	 * </p>
	 * 
	 * @param str string with directional markers to remove
	 * @return string with no directional markers 
	 * @see #process(String)
	 * @see #process(String, String)
	 * @since 3.3
	 */
	public static String deprocess(String str) {
		if (!IS_PROCESSING_NEEDED || str == null || str.length() <= 1)
			return str;

		int length = str.length();
		StringBuffer buf = new StringBuffer(length);
		for (int i = 0; i < length; i++) {
			char c = str.charAt(i);
			// copy everything except the three directional markers
			if (c != LRE && c != PDF && c != LRM)
				buf.append(c);
		}

		return buf.toString();
	}

	/**
	 * Return the string containing all the default delimiter characters to be
	 * used to segment a given string.
	 * 
	 * @return delimiter string
	 */
	public static String getDefaultDelimiters() {
		return delimiterString;
	}

	/*
	 * Return whether or not the character is right to left oriented.
	 */
	private static boolean isRTL(char c) {
		/*
		 * Cannot use Character.getDirectionality() since the OSGi library can
		 * be compiled with execution environments that pre-date that API.
		 * 
		 * The first range of characters is Unicode Hebrew and Arabic
		 * characters. The second range of characters is Unicode Hebrew and
		 * Arabic presentation forms.
		 * 
		 * NOTE: Farsi and Urdu fall within the Arabic scripts.
		 */
		return (((c >= 0x05d0) && (c <= 0x07b1)) || ((c >= 0xfb1d) && (c <= 0xfefc)));
	}

	/*
	 * Return whether or not the given character is treated as neutral here,
	 * i.e. it is neither a digit nor a letter.
	 */
	private static boolean isNeutral(char c) {
		return !(Character.isDigit(c) || Character.isLetter(c));
	}

	/*
	 * Constructor for the class.
	 */
	private TextProcessor() {
		// prevent instantiation
	}
}