File: RtfTokeniser.java

package info (click to toggle)
libitext-java 1.4.5-3
links: PTS
area: main
in suites: lenny
size: 7,748 kB
ctags: 12,714
sloc: java: 88,264; xml: 305; makefile: 19
file content (240 lines) | stat: -rw-r--r-- 9,469 bytes
parent folder | download | duplicates (2)
/**
 * $Id: RtfTokeniser.java,v 1.3 2006/08/28 11:21:15 blowagie Exp $
 * $Name:  $
 *
 * Copyright 2006 by Mark Hall
 *
 * The contents of this file are subject to the Mozilla Public License Version 1.1
 * (the "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the License.
 *
 * The Original Code is 'iText, a free JAVA-PDF library'.
 *
 * The Initial Developer of the Original Code is Bruno Lowagie. Portions created by
 * the Initial Developer are Copyright (C) 1999-2006 by Bruno Lowagie.
 * All Rights Reserved.
 * Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer
 * are Copyright (C) 2000-2006 by Paulo Soares. All Rights Reserved.
 *
 * Contributor(s): all the names of the contributors are added in the source code
 * where applicable.
 *
 * Alternatively, the contents of this file may be used under the terms of the
 * LGPL license (the ?GNU LIBRARY GENERAL PUBLIC LICENSE?), in which case the
 * provisions of LGPL are applicable instead of those above.  If you wish to
 * allow use of your version of this file only under the terms of the LGPL
 * License and not to allow others to use your version of this file under
 * the MPL, indicate your decision by deleting the provisions above and
 * replace them with the notice and other provisions required by the LGPL.
 * If you do not delete the provisions above, a recipient may use your version
 * of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE.
 *
 * This library is free software; you can redistribute it and/or modify it
 * under the terms of the MPL as stated above or under the terms of the GNU
 * Library General Public License as published by the Free Software Foundation;
 * either version 2 of the License, or any later version.
 *
 * This library is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more
 * details.
 *
 * If you didn't download this code from the following link, you should check if
 * you aren't using an obsolete version:
 * http://www.lowagie.com/iText/
 */

package com.lowagie.text.rtf.direct;

import java.io.IOException;
import java.io.Reader;

import com.lowagie.text.DocumentException;

/**
 * The RtfTokeniser takes an RTF document stream and
 * turns it into a set of RTF tokens. Five groups of
 * tokens are differentiated:
 * 
 * <ul>
 *   <li>Group opening: {</li>
 *   <li>Group closing: }</li>
 *   <li>Control characters</li>
 *   <li>Control words</li>
 *   <li>Text</li>
 * </ul>
 * 
 * @version $Revision: 1.3 $
 * @author Mark Hall (mhall@edu.uni-klu.ac.at)
 */
public class RtfTokeniser {
	/**
	 * The RtfTokeniser is in its ground state. Any token may follow.
	 */
	private static final int TOKENISER_STATE_READY = 0;
	/**
	 * The last token parsed was a slash.
	 */
	private static final int TOKENISER_STATE_SLASH = 1;
	/**
	 * The RtfTokeniser is currently tokenising a control word.
	 */
	private static final int TOKENISER_STATE_IN_CTRL_WORD = 2;
	/**
	 * The RtfTokeniser is currently tokenising a text.
	 */
	private static final int TOKENISER_STATE_IN_TEXT = 4;
	
	/**
	 * The current state of this RtfTokeniser.
	 */
	private int state = TOKENISER_STATE_READY;
	/**
	 * The current group nesting level.
	 */
	private int groupLevel = 0;
	/**
	 * The RtfParser to send tokens to.
	 */
	private RtfParser rtfParser = null;

	/**
	 * Constructs a new RtfTokeniser. The startGroupLevel is required when parsing
	 * RTF fragments, since they are missing the opening group and closing group
	 * and thus this has to be set at the beginning.
	 * 
	 * @param rtfParser The RtfParser to send tokens to.
	 * @param startGroupLevel The starting group nesting level. 0 for full documents, 1 for fragments.
	 */
	public RtfTokeniser(RtfParser rtfParser, int startGroupLevel) {
		this.rtfParser = rtfParser;
		this.groupLevel = startGroupLevel;
	}
	
	/**
	 * The main tokenisation method. Implements a LL(1) parser.
	 * 
	 * @param reader The Reader to read the RTF document from.
	 * @throws IOException On I/O errors.
	 * @throws DocumentException On errors writing the document.
	 */
	public void tokenise(Reader reader) throws IOException, DocumentException {
		char[] nextChar = new char[1];
		String temp = "";
		this.state = TOKENISER_STATE_READY;
		this.groupLevel = 0;
		while(reader.read(nextChar) != -1) {
			if(this.state == TOKENISER_STATE_READY) { // No influence from previous characters.
				if(nextChar[0] == '{') { // Open a group
					this.rtfParser.handleOpenGroup(this.groupLevel);
					groupLevel++;
				} else if(nextChar[0] == '}') { // Close a group
					this.rtfParser.handleCloseGroup(this.groupLevel);
					groupLevel--;
				} else if(nextChar[0] == '\\') {
					this.state = TOKENISER_STATE_SLASH;
					temp = "";
				} else {
					this.state = TOKENISER_STATE_IN_TEXT;
					temp = temp + nextChar[0];
				}
			} else if((this.state & TOKENISER_STATE_SLASH) == TOKENISER_STATE_SLASH) { // A slash signals a control character or word or an escaped character
				if(nextChar[0] == '{') {
					this.state = TOKENISER_STATE_IN_TEXT;
					temp = temp + "\\{";
				} else if(nextChar[0] == '}') {
					this.state = TOKENISER_STATE_IN_TEXT;
					temp = temp + "\\}";
				} else if(nextChar[0] == '\\') {
					this.state = TOKENISER_STATE_IN_TEXT;
					temp = temp + "\\\\";
				} else {
					if((this.state & TOKENISER_STATE_IN_TEXT) == TOKENISER_STATE_IN_TEXT) { // A control word or character closes previous text token
						this.rtfParser.handleText(temp, this.groupLevel);
						temp = "";
					}
					if(nextChar[0] == '|') {
						this.state = TOKENISER_STATE_READY;
						this.rtfParser.handleCtrlCharacter("\\|", this.groupLevel);
					} else if(nextChar[0] == '~') {
						this.state = TOKENISER_STATE_READY;
						this.rtfParser.handleCtrlCharacter("\\~", this.groupLevel);
					} else if(nextChar[0] == '-') {
						this.state = TOKENISER_STATE_READY;
						this.rtfParser.handleCtrlCharacter("\\-", this.groupLevel);
					} else if(nextChar[0] == '_') {
						this.state = TOKENISER_STATE_READY;
						this.rtfParser.handleCtrlCharacter("\\_", this.groupLevel);
					} else if(nextChar[0] == ':') {
						this.state = TOKENISER_STATE_READY;
						this.rtfParser.handleCtrlCharacter("\\:", this.groupLevel);
					} else if(nextChar[0] == '*') {
						this.state = TOKENISER_STATE_READY;
						this.rtfParser.handleCtrlCharacter("\\*", this.groupLevel);
					} else {
						this.state = TOKENISER_STATE_IN_CTRL_WORD;
						temp = "\\" + nextChar[0];
					}
				}
			} else if(this.state == TOKENISER_STATE_IN_CTRL_WORD) { // Control words run until a space, close or open group or another control word is found.
				if(nextChar[0] == '\n' || nextChar[0] == '\r') {
					nextChar[0] = ' ';
				}
				if(nextChar[0] == '{') {
					this.rtfParser.handleCtrlWord(temp, this.groupLevel);
					this.rtfParser.handleOpenGroup(this.groupLevel);
					groupLevel++;
					this.state = TOKENISER_STATE_READY;
					temp = "";
				} else if(nextChar[0] == '}') {
					this.rtfParser.handleCtrlWord(temp, this.groupLevel);
					this.rtfParser.handleCloseGroup(this.groupLevel);
					groupLevel--;
					this.state = TOKENISER_STATE_READY;
					temp = "";
				} else if(nextChar[0] == '\\') {
					this.rtfParser.handleCtrlWord(temp, this.groupLevel);
					this.state = TOKENISER_STATE_SLASH;
					temp = "";
				} else if(nextChar[0] == ' ') {
					this.rtfParser.handleCtrlWord(temp, this.groupLevel);
					this.rtfParser.handleText(" ", this.groupLevel);
					this.state = TOKENISER_STATE_READY;
					temp = "";
				} else if(nextChar[0] == ';') {
					this.rtfParser.handleCtrlWord(temp, this.groupLevel);
					this.rtfParser.handleText(";", this.groupLevel);
					this.state = TOKENISER_STATE_READY;
					temp = "";
				} else {
					temp = temp + nextChar[0];
				}
			} else if(this.state == TOKENISER_STATE_IN_TEXT) { // Text tokens are closed by control characters or words or open and close groups
				if(nextChar[0] == '{') {
					this.rtfParser.handleText(temp, this.groupLevel);
					this.rtfParser.handleOpenGroup(this.groupLevel);
					groupLevel++;
					this.state = TOKENISER_STATE_READY;
					temp = "";
				} else if(nextChar[0] == '}') {
					this.rtfParser.handleText(temp, this.groupLevel);
					this.rtfParser.handleCloseGroup(this.groupLevel);
					groupLevel--;
					this.state = TOKENISER_STATE_READY;
					temp = "";
				} else if(nextChar[0] == '\\') {
					this.state = TOKENISER_STATE_IN_TEXT | TOKENISER_STATE_SLASH;
				} else {
					temp = temp + nextChar[0];
				}
			}
		}
		if((this.state & TOKENISER_STATE_IN_TEXT) == TOKENISER_STATE_IN_TEXT && !temp.equals("")) { // If at the end a text token was being parsed, emmit that token. Required for RTF fragments
			this.rtfParser.handleText(temp, this.groupLevel);
		}
	}
}