File: CSVTokenizer.java

package info (click to toggle)
libbase 1.1.6-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 944 kB
  • sloc: java: 8,709; xml: 1,522; makefile: 18
file content (369 lines) | stat: -rw-r--r-- 12,464 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
/*
 * This program is free software; you can redistribute it and/or modify it under the
 * terms of the GNU Lesser General Public License, version 2.1 as published by the Free Software
 * Foundation.
 *
 * You should have received a copy of the GNU Lesser General Public License along with this
 * program; if not, you can obtain a copy at http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html
 * or from the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU Lesser General Public License for more details.
 *
 * Copyright (c) 2007 - 2009 Pentaho Corporation and Contributors.  All rights reserved.
 */

package org.pentaho.reporting.libraries.base.util;

import java.util.Enumeration;
import java.util.NoSuchElementException;

/**
 * The csv tokenizer class allows an application to break a Comma Separated Value format into tokens. The tokenization
 * method is much simpler than the one used by the <code>StringTokenizer</code> class. The <code>CSVTokenizer</code>
 * methods do not distinguish among identifiers, numbers, and quoted strings, nor do they recognize and skip comments.
 * <p/>
 * The set of separator (the characters that separate tokens) may be specified either at creation time or on a per-token
 * basis.
 * <p/>
 * An instance of <code>CSVTokenizer</code> behaves in one of two ways, depending on whether it was created with the
 * <code>returnSeparators</code> flag having the value <code>true</code> or <code>false</code>: <ul> <li>If the flag is
 * <code>false</code>, delimiter characters serve to separate tokens. A token is a maximal sequence of consecutive
 * characters that are not separator. <li>If the flag is <code>true</code>, delimiter characters are themselves
 * considered to be tokens. A token is thus either one delimiter character, or a maximal sequence of consecutive
 * characters that are not separator. </ul><p> A <tt>CSVTokenizer</tt> object internally maintains a current position
 * within the string to be tokenized. Some operations advance this current position past the characters processed.<p> A
 * token is returned by taking a substring of the string that was used to create the <tt>CSVTokenizer</tt> object.
 * <p/>
 * The following is one example of the use of the tokenizer. The code:
 * <blockquote><pre>
 *     CSVTokenizer csvt = new CSVTokenizer("this,is,a,test");
 *     while (csvt.hasMoreTokens()) {
 *         println(csvt.nextToken());
 *     }
 * </pre></blockquote>
 * <p/>
 * prints the following output:
 * <blockquote><pre>
 *     this
 *     is
 *     a
 *     test
 * </pre></blockquote>
 *
 * @author abupon
 */
public class CSVTokenizer implements Enumeration
{
  /**
   * The complete record that should be separated into elements.
   */
  private String record;
  /**
   * The separator.
   */
  private String separator;
  /**
   * The quoting char.
   */
  private String quate;

  /**
   * the current parsing position.
   */
  private int currentIndex;

  /**
   * A flag indicating that the current parse position is before the start.
   */
  private boolean beforeStart;

  /**
   * A possible separator constant.
   */
  public static final String SEPARATOR_COMMA = ",";
  /**
   * A possible separator constant.
   */
  public static final String SEPARATOR_TAB = "\t";
  /**
   * A possible separator constant.
   */
  public static final String SEPARATOR_SPACE = " ";

  /**
   * A possible quote character constant.
   */
  public static final String DOUBLE_QUATE = "\"";
  /**
   * A possible quote character constant.
   */
  public static final String SINGLE_QUATE = "'";

  /**
   * Constructs a csv tokenizer for the specified string. <code>theSeparator</code> argument is the separator for
   * separating tokens.
   * <p/>
   * If the <code>returnSeparators</code> flag is <code>true</code>, then the separator string is also returned as
   * tokens. separator is returned as a string. If the flag is <code>false</code>, the separator string is skipped and
   * only serve as separator between tokens.
   *
   * @param aString      a string to be parsed.
   * @param theSeparator the separator (CSVTokenizer.SEPARATOR_COMMA, CSVTokenizer.TAB, CSVTokenizer.SPACE, etc.).
   * @param theQuate     the quate (CSVTokenizer.SINGLE_QUATE, CSVTokenizer.DOUBLE_QUATE, etc.).
   */
  public CSVTokenizer(final String aString,
                      final String theSeparator,
                      final String theQuate)
  {
    this(aString, theSeparator, theQuate, true);
  }

  /**
   * Constructs a csv tokenizer for the specified string. <code>theSeparator</code> argument is the separator for
   * separating tokens.
   * <p/>
   * If the <code>returnSeparators</code> flag is <code>true</code>, then the separator string is also returned as
   * tokens. separator is returned as a string. If the flag is <code>false</code>, the separator string is skipped and
   * only serve as separator between tokens.
   *
   * @param aString      a string to be parsed.
   * @param theSeparator the separator (CSVTokenizer.SEPARATOR_COMMA, CSVTokenizer.TAB, CSVTokenizer.SPACE, etc.).
   * @param theQuate     the quate (CSVTokenizer.SINGLE_QUATE, CSVTokenizer.DOUBLE_QUATE, etc.).
   */
  public CSVTokenizer(final String aString,
                      final String theSeparator,
                      final String theQuate,
                      final boolean trim)
  {
    if (aString == null)
    {
      throw new NullPointerException("The given string is null");
    }
    if (theSeparator == null)
    {
      throw new NullPointerException("The given separator is null");
    }
    if (trim)
    {
      this.record = aString.trim();
    }
    else
    {
      this.record = aString;
    }
    this.separator = theSeparator;
    this.quate = theQuate;
    this.currentIndex = 0;
    this.beforeStart = true;
  }

  /**
   * Constructs a csv tokenizer for the specified string. The characters in the <code>theSeparator</code> argument are
   * the separator for separating tokens. Separator string themselves will not be treated as tokens.
   *
   * @param aString      a string to be parsed.
   * @param theSeparator the separator (CSVTokenizer.SEPARATOR_COMMA, CSVTokenizer.TAB, CSVTokenizer.SPACE, etc.).
   */
  public CSVTokenizer(final String aString, final String theSeparator)
  {
    this(aString, theSeparator, CSVTokenizer.DOUBLE_QUATE);
  }

  /**
   * Constructs a string tokenizer for the specified string. The tokenizer uses the default separator set, which is
   * <code>CSVTokenizer.SEPARATOR_COMMA</code>. Separator string themselves will not be treated as tokens.
   *
   * @param aString a string to be parsed.
   */
  public CSVTokenizer(final String aString)
  {
    this(aString, CSVTokenizer.SEPARATOR_COMMA, CSVTokenizer.DOUBLE_QUATE, true);
  }

  /**
   * Constructs a string tokenizer for the specified string. The tokenizer uses the default separator set, which is
   * <code>CSVTokenizer.SEPARATOR_COMMA</code>. Separator string themselves will not be treated as tokens.
   *
   * @param aString a string to be parsed.
   */
  public CSVTokenizer(final String aString, final boolean trim)
  {
    this(aString, CSVTokenizer.SEPARATOR_COMMA, CSVTokenizer.DOUBLE_QUATE, trim);
  }

  /**
   * Tests if there are more tokens available from this tokenizer's string. If this method returns <tt>true</tt>, then a
   * subsequent call to <tt>nextToken</tt> with no argument will successfully return a token.
   *
   * @return <code>true</code> if and only if there is at least one token in the string after the current position;
   *         <code>false</code> otherwise.
   */
  public boolean hasMoreTokens()
  {
    return (this.currentIndex < this.record.length());
  }

  /**
   * Returns the next token from this string tokenizer.
   *
   * @return the next token from this string tokenizer.
   * @throws NoSuchElementException   if there are no more tokens in this tokenizer's string.
   * @throws IllegalArgumentException if given parameter string format was wrong
   */
  public String nextToken()
      throws NoSuchElementException, IllegalArgumentException
  {

    if (!this.hasMoreTokens())
    {
      throw new NoSuchElementException();
    }

    if (beforeStart == false)
    {
      currentIndex += this.separator.length();
    }
    else
    {
      beforeStart = false;
    }

    if (this.quate != null &&
        this.record.startsWith(this.quate, this.currentIndex))
    {
      final int quateLength = this.quate.length();
      int startOffset = this.currentIndex + quateLength;
      final StringBuffer b = new StringBuffer();
      while (true)
      {
        final int end = record.indexOf(this.quate, startOffset);
        if (end < 0)
        {
          throw new IllegalArgumentException("Illegal format");
        }

        if (record.startsWith(this.quate, end + 1) == false)
        {
          b.append (record.substring(startOffset, end));
          currentIndex = end + 1;
          return b.toString();
        }
        else
        {
          b.append (record.substring(startOffset, end + 1));
        }

        startOffset = end + quateLength * 2;
      }
    }

    final int end = this.record.indexOf(this.separator, this.currentIndex);
    if (end >= 0)
    {
      final int start = this.currentIndex;
      final String token = this.record.substring(start, end);
      this.currentIndex = end;
      return token;
    }
    else
    {
      final int start = this.currentIndex;
      final String token = this.record.substring(start);
      this.currentIndex = this.record.length();
      return token;
    }
  }

  /**
   * Returns the next token in this string tokenizer's string. First, the set of characters considered to be separator
   * by this <tt>CSVTokenizer</tt> object is changed to be the characters in the string <tt>separator</tt>. Then the
   * next token in the string after the current position is returned. The current position is advanced beyond the
   * recognized token.  The new delimiter set remains the default after this call.
   *
   * @param theSeparator the new separator.
   * @return the next token, after switching to the new delimiter set.
   * @throws NoSuchElementException
   *          if there are no more tokens in this tokenizer's string.
   */
  public String nextToken(final String theSeparator)
  {
    separator = theSeparator;
    return nextToken();
  }

  /**
   * Returns the same value as the <code>hasMoreTokens</code> method. It exists so that this class can implement the
   * <code>Enumeration</code> interface.
   *
   * @return <code>true</code> if there are more tokens; <code>false</code> otherwise.
   * @see Enumeration
   * @see CSVTokenizer#hasMoreTokens()
   */
  public boolean hasMoreElements()
  {
    return hasMoreTokens();
  }

  /**
   * Returns the same value as the <code>nextToken</code> method, except that its declared return value is
   * <code>Object</code> rather than <code>String</code>. It exists so that this class can implement the
   * <code>Enumeration</code> interface.
   *
   * @return the next token in the string.
   * @throws NoSuchElementException
   *          if there are no more tokens in this tokenizer's string.
   * @see Enumeration
   * @see CSVTokenizer#nextToken()
   */
  public Object nextElement()
  {
    return nextToken();
  }

  /**
   * Calculates the number of times that this tokenizer's <code>nextToken</code> method can be called before it
   * generates an exception. The current position is not advanced.
   *
   * @return the number of tokens remaining in the string using the current delimiter set.
   * @see CSVTokenizer#nextToken()
   */
  public int countTokens()
  {
    int count = 0;

    final int preserve = this.currentIndex;
    final boolean preserveStart = this.beforeStart;
    while (this.hasMoreTokens())
    {
      this.nextToken();
      count++;
    }
    this.currentIndex = preserve;
    this.beforeStart = preserveStart;

    return count;
  }

  /**
   * Returns the quate.
   *
   * @return char
   */
  public String getQuate()
  {
    return this.quate;
  }

  /**
   * Sets the quate.
   *
   * @param quate The quate to set
   */
  public void setQuate(final String quate)
  {
    this.quate = quate;
  }
}