1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369
|
/*
* This program is free software; you can redistribute it and/or modify it under the
* terms of the GNU Lesser General Public License, version 2.1 as published by the Free Software
* Foundation.
*
* You should have received a copy of the GNU Lesser General Public License along with this
* program; if not, you can obtain a copy at http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html
* or from the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU Lesser General Public License for more details.
*
* Copyright (c) 2007 - 2009 Pentaho Corporation and Contributors. All rights reserved.
*/
package org.pentaho.reporting.libraries.base.util;
import java.util.Enumeration;
import java.util.NoSuchElementException;
/**
* The csv tokenizer class allows an application to break a Comma Separated Value format into tokens. The tokenization
* method is much simpler than the one used by the <code>StringTokenizer</code> class. The <code>CSVTokenizer</code>
* methods do not distinguish among identifiers, numbers, and quoted strings, nor do they recognize and skip comments.
* <p/>
* The set of separator (the characters that separate tokens) may be specified either at creation time or on a per-token
* basis.
* <p/>
* An instance of <code>CSVTokenizer</code> behaves in one of two ways, depending on whether it was created with the
* <code>returnSeparators</code> flag having the value <code>true</code> or <code>false</code>: <ul> <li>If the flag is
* <code>false</code>, delimiter characters serve to separate tokens. A token is a maximal sequence of consecutive
* characters that are not separator. <li>If the flag is <code>true</code>, delimiter characters are themselves
* considered to be tokens. A token is thus either one delimiter character, or a maximal sequence of consecutive
* characters that are not separator. </ul><p> A <tt>CSVTokenizer</tt> object internally maintains a current position
* within the string to be tokenized. Some operations advance this current position past the characters processed.<p> A
* token is returned by taking a substring of the string that was used to create the <tt>CSVTokenizer</tt> object.
* <p/>
* The following is one example of the use of the tokenizer. The code:
* <blockquote><pre>
* CSVTokenizer csvt = new CSVTokenizer("this,is,a,test");
* while (csvt.hasMoreTokens()) {
* println(csvt.nextToken());
* }
* </pre></blockquote>
* <p/>
* prints the following output:
* <blockquote><pre>
* this
* is
* a
* test
* </pre></blockquote>
*
* @author abupon
*/
public class CSVTokenizer implements Enumeration
{
/**
* The complete record that should be separated into elements.
*/
private String record;
/**
* The separator.
*/
private String separator;
/**
* The quoting char.
*/
private String quate;
/**
* the current parsing position.
*/
private int currentIndex;
/**
* A flag indicating that the current parse position is before the start.
*/
private boolean beforeStart;
/**
* A possible separator constant.
*/
public static final String SEPARATOR_COMMA = ",";
/**
* A possible separator constant.
*/
public static final String SEPARATOR_TAB = "\t";
/**
* A possible separator constant.
*/
public static final String SEPARATOR_SPACE = " ";
/**
* A possible quote character constant.
*/
public static final String DOUBLE_QUATE = "\"";
/**
* A possible quote character constant.
*/
public static final String SINGLE_QUATE = "'";
/**
* Constructs a csv tokenizer for the specified string. <code>theSeparator</code> argument is the separator for
* separating tokens.
* <p/>
* If the <code>returnSeparators</code> flag is <code>true</code>, then the separator string is also returned as
* tokens. separator is returned as a string. If the flag is <code>false</code>, the separator string is skipped and
* only serve as separator between tokens.
*
* @param aString a string to be parsed.
* @param theSeparator the separator (CSVTokenizer.SEPARATOR_COMMA, CSVTokenizer.TAB, CSVTokenizer.SPACE, etc.).
* @param theQuate the quate (CSVTokenizer.SINGLE_QUATE, CSVTokenizer.DOUBLE_QUATE, etc.).
*/
public CSVTokenizer(final String aString,
final String theSeparator,
final String theQuate)
{
this(aString, theSeparator, theQuate, true);
}
/**
* Constructs a csv tokenizer for the specified string. <code>theSeparator</code> argument is the separator for
* separating tokens.
* <p/>
* If the <code>returnSeparators</code> flag is <code>true</code>, then the separator string is also returned as
* tokens. separator is returned as a string. If the flag is <code>false</code>, the separator string is skipped and
* only serve as separator between tokens.
*
* @param aString a string to be parsed.
* @param theSeparator the separator (CSVTokenizer.SEPARATOR_COMMA, CSVTokenizer.TAB, CSVTokenizer.SPACE, etc.).
* @param theQuate the quate (CSVTokenizer.SINGLE_QUATE, CSVTokenizer.DOUBLE_QUATE, etc.).
*/
public CSVTokenizer(final String aString,
final String theSeparator,
final String theQuate,
final boolean trim)
{
if (aString == null)
{
throw new NullPointerException("The given string is null");
}
if (theSeparator == null)
{
throw new NullPointerException("The given separator is null");
}
if (trim)
{
this.record = aString.trim();
}
else
{
this.record = aString;
}
this.separator = theSeparator;
this.quate = theQuate;
this.currentIndex = 0;
this.beforeStart = true;
}
/**
* Constructs a csv tokenizer for the specified string. The characters in the <code>theSeparator</code> argument are
* the separator for separating tokens. Separator string themselves will not be treated as tokens.
*
* @param aString a string to be parsed.
* @param theSeparator the separator (CSVTokenizer.SEPARATOR_COMMA, CSVTokenizer.TAB, CSVTokenizer.SPACE, etc.).
*/
public CSVTokenizer(final String aString, final String theSeparator)
{
this(aString, theSeparator, CSVTokenizer.DOUBLE_QUATE);
}
/**
* Constructs a string tokenizer for the specified string. The tokenizer uses the default separator set, which is
* <code>CSVTokenizer.SEPARATOR_COMMA</code>. Separator string themselves will not be treated as tokens.
*
* @param aString a string to be parsed.
*/
public CSVTokenizer(final String aString)
{
this(aString, CSVTokenizer.SEPARATOR_COMMA, CSVTokenizer.DOUBLE_QUATE, true);
}
/**
* Constructs a string tokenizer for the specified string. The tokenizer uses the default separator set, which is
* <code>CSVTokenizer.SEPARATOR_COMMA</code>. Separator string themselves will not be treated as tokens.
*
* @param aString a string to be parsed.
*/
public CSVTokenizer(final String aString, final boolean trim)
{
this(aString, CSVTokenizer.SEPARATOR_COMMA, CSVTokenizer.DOUBLE_QUATE, trim);
}
/**
* Tests if there are more tokens available from this tokenizer's string. If this method returns <tt>true</tt>, then a
* subsequent call to <tt>nextToken</tt> with no argument will successfully return a token.
*
* @return <code>true</code> if and only if there is at least one token in the string after the current position;
* <code>false</code> otherwise.
*/
public boolean hasMoreTokens()
{
return (this.currentIndex < this.record.length());
}
/**
* Returns the next token from this string tokenizer.
*
* @return the next token from this string tokenizer.
* @throws NoSuchElementException if there are no more tokens in this tokenizer's string.
* @throws IllegalArgumentException if given parameter string format was wrong
*/
public String nextToken()
throws NoSuchElementException, IllegalArgumentException
{
if (!this.hasMoreTokens())
{
throw new NoSuchElementException();
}
if (beforeStart == false)
{
currentIndex += this.separator.length();
}
else
{
beforeStart = false;
}
if (this.quate != null &&
this.record.startsWith(this.quate, this.currentIndex))
{
final int quateLength = this.quate.length();
int startOffset = this.currentIndex + quateLength;
final StringBuffer b = new StringBuffer();
while (true)
{
final int end = record.indexOf(this.quate, startOffset);
if (end < 0)
{
throw new IllegalArgumentException("Illegal format");
}
if (record.startsWith(this.quate, end + 1) == false)
{
b.append (record.substring(startOffset, end));
currentIndex = end + 1;
return b.toString();
}
else
{
b.append (record.substring(startOffset, end + 1));
}
startOffset = end + quateLength * 2;
}
}
final int end = this.record.indexOf(this.separator, this.currentIndex);
if (end >= 0)
{
final int start = this.currentIndex;
final String token = this.record.substring(start, end);
this.currentIndex = end;
return token;
}
else
{
final int start = this.currentIndex;
final String token = this.record.substring(start);
this.currentIndex = this.record.length();
return token;
}
}
/**
* Returns the next token in this string tokenizer's string. First, the set of characters considered to be separator
* by this <tt>CSVTokenizer</tt> object is changed to be the characters in the string <tt>separator</tt>. Then the
* next token in the string after the current position is returned. The current position is advanced beyond the
* recognized token. The new delimiter set remains the default after this call.
*
* @param theSeparator the new separator.
* @return the next token, after switching to the new delimiter set.
* @throws NoSuchElementException
* if there are no more tokens in this tokenizer's string.
*/
public String nextToken(final String theSeparator)
{
separator = theSeparator;
return nextToken();
}
/**
* Returns the same value as the <code>hasMoreTokens</code> method. It exists so that this class can implement the
* <code>Enumeration</code> interface.
*
* @return <code>true</code> if there are more tokens; <code>false</code> otherwise.
* @see Enumeration
* @see CSVTokenizer#hasMoreTokens()
*/
public boolean hasMoreElements()
{
return hasMoreTokens();
}
/**
* Returns the same value as the <code>nextToken</code> method, except that its declared return value is
* <code>Object</code> rather than <code>String</code>. It exists so that this class can implement the
* <code>Enumeration</code> interface.
*
* @return the next token in the string.
* @throws NoSuchElementException
* if there are no more tokens in this tokenizer's string.
* @see Enumeration
* @see CSVTokenizer#nextToken()
*/
public Object nextElement()
{
return nextToken();
}
/**
* Calculates the number of times that this tokenizer's <code>nextToken</code> method can be called before it
* generates an exception. The current position is not advanced.
*
* @return the number of tokens remaining in the string using the current delimiter set.
* @see CSVTokenizer#nextToken()
*/
public int countTokens()
{
int count = 0;
final int preserve = this.currentIndex;
final boolean preserveStart = this.beforeStart;
while (this.hasMoreTokens())
{
this.nextToken();
count++;
}
this.currentIndex = preserve;
this.beforeStart = preserveStart;
return count;
}
/**
* Returns the quate.
*
* @return char
*/
public String getQuate()
{
return this.quate;
}
/**
* Sets the quate.
*
* @param quate The quate to set
*/
public void setQuate(final String quate)
{
this.quate = quate;
}
}
|