/*
 * QueryParser.java December 2002
 *
 * Copyright (C) 2002, Niall Gallagher
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package org.simpleframework.http.parse;

import org.simpleframework.http.Query;
import org.simpleframework.util.parse.MapParser;

import java.net.URLEncoder;
import java.util.Set;

/**
 * The <code>QueryParser</code> is used to parse data encoded in
 * the <code>application/x-www-form-urlencoded</code> MIME type. It
 * is also used to parse a query string from a HTTP URL, see RFC 2616.
 * The parsed parameters are available through the various methods of
 * the <code>org.simpleframework.http.Query</code> interface. The
 * syntax of the parsed parameters is described below in BNF.
 * <pre>
 *
 *    params  = *(pair [ "&amp;" params])
 *    pair    = name "=" value
 *    name    = *(text | escaped)
 *    value   = *(text | escaped)
 *    escaped = % HEX HEX
 *
 * </pre>
 * This will consume all data found as a name or value, if the data
 * is a "+" character then it is replaced with a space character.
 * This regards only "=", "&amp;", and "%" as having special values.
 * The "=" character delimits the name from the value and the "&amp;"
 * delimits the name value pair. The "%" character represents the
 * start of an escaped sequence, which consists of two hex digits.
 * All escaped sequences are converted to its character value.
 * <p>
 * Two lenient rules apply to input that does not match the grammar.
 * A name that is not followed by "=" is stored with an empty value,
 * and a "%" that is not followed by two hexadecimal digits is kept
 * as a literal "%" character.
 *
 * @author Niall Gallagher
 */
public class QueryParser extends MapParser implements Query {

   /**
    * Used to accumulate the characters for the parameter name.
    */
   private Token name;

   /**
    * Used to accumulate the characters for the parameter value.
    */
   private Token value;

   /**
    * Constructor for the <code>QueryParser</code>. This creates
    * an instance that can be use to parse HTML form data and URL
    * query strings encoded as application/x-www-form-urlencoded.
    * The parsed parameters are made available through the
    * <code>Query</code> interface this class implements.
    */
   public QueryParser(){
      this.name = new Token();
      this.value = new Token();
   }

   /**
    * Constructor for the <code>QueryParser</code>. This creates
    * an instance and immediately parses the provided text, which
    * is expected to be encoded as application/x-www-form-urlencoded.
    * The parsed parameters are made available through the
    * <code>Query</code> interface this class implements.
    *
    * @param text this is the text to parse for the parameters
    */
   public QueryParser(String text){
      this();
      parse(text);
   }

   /**
    * This extracts an integer parameter for the named value. If the
    * named parameter does not exist this will return a zero value.
    * If however the parameter exists but is not in the format of a
    * decimal integer value then this will throw an exception.
    *
    * @param name the name of the parameter value to retrieve
    *
    * @return this returns the named parameter value as an integer
    *
    * @throws NumberFormatException if the value is not an integer
    */
   public int getInteger(Object name) {
      String value = get(name);

      if(value != null) {
         return Integer.parseInt(value);
      }
      return 0;
   }

   /**
    * This extracts a float parameter for the named value. If the
    * named parameter does not exist this will return a zero value.
    * If however the parameter exists but is not in the format of a
    * floating point number then this will throw an exception.
    *
    * @param name the name of the parameter value to retrieve
    *
    * @return this returns the named parameter value as a float
    *
    * @throws NumberFormatException if the value is not a float
    */
   public float getFloat(Object name) {
      String value = get(name);

      if(value != null) {
         return Float.parseFloat(value);
      }
      return 0.0f;
   }

   /**
    * This extracts a boolean parameter for the named value. If the
    * named parameter does not exist this will return false. If it
    * does exist the value is evaluated with the rules used by
    * <code>Boolean.valueOf</code>, so only the text "true", in
    * any case, gives true and every other text gives false.
    *
    * @param name the name of the parameter value to retrieve
    *
    * @return this returns the named parameter value as a boolean
    */
   public boolean getBoolean(Object name) {
      String value = get(name);

      if(value != null) {
         return Boolean.valueOf(value).booleanValue();
      }
      return false;
   }

   /**
    * This initializes the parser so that it can be used several
    * times. This clears the inherited <code>all</code> and
    * <code>map</code> collections, empties both tokens and moves
    * the read offset back to the start of the buffer, so that the
    * next parse begins with an empty <code>Query</code>.
    */
   protected void init(){
      all.clear();
      map.clear();
      name.len = 0;
      value.len = 0;
      off = 0;
   }

   /**
    * This performs the actual parsing of the parameter text. The
    * parameters parsed from this are taken as "name=value" pairs.
    * Multiple pairs within the text are separated by an "&amp;".
    * Each pair that has a name is inserted as it is completed.
    */
   protected void parse() {
      param();

      /* each delimiter that is consumed is followed by a pair */
      while(skip("&")){
         param();
      }
   }

   /**
    * This method adds the name and value to the map so that the
    * next name and value can be collected. A pair is only added if
    * the name contains at least one character. Once added both
    * <code>Token</code> objects are set to have zero length so
    * they can be reused to collect further values.
    */
   private void insert(){
      if(name.len > 0){
         insert(name,value);
      }
      name.len = 0;
      value.len = 0;
   }

   /**
    * This will add the given name and value to the parameters by
    * converting both tokens to strings and handing them to the
    * inherited <code>put</code> method. How a repeated name is
    * treated is decided by that method, not by this one.
    *
    * @param name this is the name of the value to be inserted
    * @param value this is the value of a that is to be inserted
    */
   private void insert(Token name, Token value){
      put(name.toString(), value.toString());
   }

   /**
    * This parses a single parameter from the buffer. The name is
    * read first, then, only if an "=" follows the name, the value
    * is read. The pair is then inserted. A name without an "="
    * is inserted with an empty value, as the value token has a
    * zero length at that point.
    */
   private void param() {
      name();

      if(skip("=")){ /* in case of error*/
         value();
      }
      insert();
   }

   /**
    * This extracts the name of the parameter from the character
    * buffer. The name of a parameter is defined as a set of
    * chars including escape sequences. The decoded chars are
    * written back in to the buffer, which is safe as the decoded
    * form is never longer than the encoded form. The name ends
    * when an unescaped "=" is encountered. It also ends at an
    * unescaped "&amp;", so that a parameter with no value does not
    * swallow the pair that follows it.
    */
   private void name(){
      int mark = off;
      int pos = off;

      while(off < count){
         if(buf[off]=='%'){ /* escaped */
            escape();
         }else if(buf[off]=='=' || buf[off]=='&') {
            break;
         }else if(buf[off]=='+'){
            buf[off] = ' ';
         }
         /* compact the decoded character toward the token start */
         buf[pos++] = buf[off++];
      }
      name.len = pos - mark;
      name.off = mark;
   }

   /**
    * This extracts a parameter value from the character buffer.
    * The parameter value consists of a sequence of chars and some
    * escape sequences. The decoded chars are written back in to
    * the buffer so that the name and values can be paired. The
    * end of the value is determined as the end of the buffer or
    * an unescaped ampersand.
    */
   private void value(){
      int mark = off;
      int pos = off;

      while(off < count){
         if(buf[off]=='%'){ /* escaped */
            escape();
         }else if(buf[off]=='+'){
            buf[off] = ' ';
         }else if(buf[off]=='&'){
            break;
         }
         /* compact the decoded character toward the token start */
         buf[pos++] = buf[off++];
      }
      value.len = pos - mark;
      value.off = mark;
   }

   /**
    * This converts an encountered escaped sequence, that is all
    * embedded hexadecimal characters, into a native UCS character
    * value. This does not take any characters from the stream it
    * just prepares the buffer with the correct character. The
    * escaped sequence will be interpreted as UTF-8 where possible.
    * <p>
    * If there is a fully valid escaped sequence, that is
    * "%" HEX HEX, this will leave the read offset on the decoded
    * character. If the "%" is not followed by two hexadecimal
    * digits nothing is changed, so the caller copies the "%" as a
    * literal character and carries on with the chars after it.
    */
   private void escape() {
      int peek = peek(off);

      if(peek < 0) {
         return; /* malformed escape, keep the '%' literally */
      }
      if(!unicode(peek)) {
         binary(peek);
      }
   }

   /**
    * This method treats the escaped octet at the read offset as a
    * single character rather than part of a UTF-8 sequence. If the
    * escape fits within the buffer the read offset is moved to the
    * last character of the escape and the octet is stored there, so
    * it is the next character read. This is used for sequences that
    * could not be decoded as UTF-8.
    *
    * @param peek this is the first escaped character from the URI
    *
    * @return currently this implementation always returns true
    */
   private boolean binary(int peek) {
      if(off + 2 < count) {
         off += 2;
         buf[off] =bits(peek);
      }
      return true;
   }

   /**
    * This method determines, using a peek character, whether the
    * sequence of escaped characters within the URI is in UTF-8. If
    * a UTF-8 character can be successfully decoded from the URI it
    * will be the next character read from the buffer. The leading
    * octet selects the length of the sequence, from one to six
    * octets, and its marker bits are masked off before decoding.
    * Because the Java <code>char</code> can only hold sixteen bits,
    * a larger value will have only the low order bits stored.
    * <p>
    * The WWW Consortium provides a reference implementation of a
    * UTF-8 decoding for Java, in this the low order octets in the
    * UCS-4 sequence are used for the character. So, in the
    * absence of a defined behaviour, the W3C behaviour is assumed.
    *
    * @param peek this is the first escaped character from the URI
    *
    * @return this returns true if a UTF-8 character is decoded
    */
   private boolean unicode(int peek) {
      if((peek & 0x80) == 0x00){ /* 0xxxxxxx */
         return unicode(peek, 0);
      }
      if((peek & 0xe0) == 0xc0){ /* 110xxxxx */
         return unicode(peek & 0x1f, 1);
      }
      if((peek & 0xf0) == 0xe0){ /* 1110xxxx */
         return unicode(peek & 0x0f, 2);
      }
      if((peek & 0xf8) == 0xf0){ /* 11110xxx */
         return unicode(peek & 0x07, 3);
      }
      if((peek & 0xfc) == 0xf8){ /* 111110xx */
         return unicode(peek & 0x03, 4);
      }
      if((peek & 0xfe) == 0xfc){ /* 1111110x */
         return unicode(peek & 0x01, 5);
      }
      return false;
   }

   /**
    * This method will decode the specified amount of escaped
    * characters from the URI and convert them into a single Java
    * character. If there are not enough characters within
    * the URI then this will return false and leave the URI alone.
    * <p>
    * The number of characters left is determined from the first
    * UTF-8 octet, as specified in RFC 2279, and because this is
    * a URI there must that number of "%" HEX HEX sequences left.
    * The check made here guarantees that the start of every
    * trailing escape lies within the buffer.
    *
    * @param peek contains the bits read from the first UTF octet
    * @param more this specifies the number of UTF octets left
    *
    * @return this returns true if a UTF-8 character is decoded
    */
   private boolean unicode(int peek, int more) {
      if(off + more * 3 >= count) {
         return false;
      }
      return unicode(peek,more,off);
   }

   /**
    * This will decode the specified amount of trailing UTF-8 bits
    * from the URI. The trailing bits are those following the first
    * UTF-8 octet, which specifies the length, in octets, of the
    * sequence. The trailing octets are of the form 10xxxxxx, for
    * each of these octets only the last six bits are valid UCS
    * bits. So a conversion is basically an accumulation of these.
    * <p>
    * If at any point during the accumulation of the UTF-8 bits
    * there is a parsing error, then parsing is aborted and false
    * is returned, as a result the URI is left unchanged. On
    * success the read offset is moved to the last character of
    * the final escape and the decoded character is stored there.
    *
    * @param peek bits that have been accumulated from the URI
    * @param more this specifies the number of UTF octets left
    * @param pos this specifies the position the parsing begins
    *
    * @return this returns true if a UTF-8 character is decoded
    */
   private boolean unicode(int peek, int more, int pos) {
      while(more-- > 0) {
         if(buf[pos] == '%'){
            int next = pos + 3;
            int hex = peek(next);

            /* a failed peek is -1, which never matches 10xxxxxx */
            if((hex & 0xc0) == 0x80){
               peek = (peek<<6)|(hex&0x3f);
               pos = next;
               continue;
            }
         }
         return false;
      }
      if(pos + 2 < count) {
         off = pos + 2;
         buf[off]= bits(peek);
      }
      return true;
   }

   /**
    * Defines behaviour for the conversion of a decoded value in to
    * a Java <code>char</code>. The UTF-8 encoding scheme enables
    * values wider than sixteen bits to be encoded, however a Java
    * <code>char</code> holds sixteen bits only. This narrows the
    * value with a cast, which keeps the low order sixteen bits.
    *
    * @param data the decoded value to be converted to a character
    *
    * @return this returns a native character from the int
    */
   private char bits(int data) {
      return (char)data;
   }

   /**
    * This will return the escape expression found at the given
    * position as an integer value of the hexadecimal sequence.
    * This does not make any changes to the buffer it simply checks
    * to see if the characters at the position specified are an
    * escaped set characters of the form "%" HEX HEX, if so, then
    * it will convert that hexadecimal string in to an integer
    * value. This gives -1 if the position does not hold a "%", if
    * the buffer ends before both digits, or if either character
    * is not hexadecimal.
    *
    * @param pos this is the position the expression starts from
    *
    * @return the integer value of the hexadecimal expression
    */
   private int peek(int pos) {
      if(buf[pos] == '%'){
         if(count <= pos + 2) {
            return -1;
         }
         char high = buf[pos + 1];
         char low = buf[pos + 2];

         return convert(high, low);
      }
      return -1;
   }

   /**
    * This will convert the two hexadecimal characters to a real
    * integer value, which is returned. This requires characters
    * within the range of 'A' to 'F' and 'a' to 'f', and also
    * the digits '0' to '9'. If the characters are not within
    * the range specified then this returns -1.
    *
    * @param high this is the high four bits within the integer
    * @param low this is the low four bits within the integer
    *
    * @return this returns the integer value of the conversion
    */
   private int convert(char high, char low) {
      int hex = 0x00;

      if(hex(high) && hex(low)){
         /* subtracting ('A' - 'a') folds upper case to lower case */
         if('A' <= high && high <= 'F'){
            high -= 'A' - 'a';
         }
         if(high >= 'a') {
            hex ^= (high-'a')+10;
         } else {
            hex ^= high -'0';
         }
         hex <<= 4;

         if('A' <= low && low <= 'F') {
            low -= 'A' - 'a';
         }
         if(low >= 'a') {
            hex ^= (low-'a')+10;
         } else {
            hex ^= low-'0';
         }
         return hex;
      }
      return -1;
   }

   /**
    * This is used to determine whether a char is a hexadecimal
    * <code>char</code> or not. A hexadecimal character is considered
    * to be a character within the range of <code>0 - 9</code> and
    * between <code>a - f</code> and <code>A - F</code>. This will
    * return true if the character is in this range.
    *
    * @param ch this is the character which is to be determined here
    *
    * @return true if the character given has a hexadecimal value
    */
   private boolean hex(char ch) {
      if(ch >= '0' && ch <= '9') {
         return true;
      } else if(ch >='a' && ch <= 'f') {
         return true;
      } else if(ch >= 'A' && ch <= 'F') {
         return true;
      }
      return false;
   }

   /**
    * This <code>encode</code> method will escape the text that
    * is provided, using <code>URLEncoder</code> with the UTF-8
    * character set. A null text is returned as it was given, and
    * should the encoding be reported as unsupported the text is
    * returned without being escaped.
    *
    * @param text this is the text that is to be escaped
    *
    * @return the text with % HEX HEX UTF-8 escape sequences
    */
   private String encode(String text) {
      if(text == null) {
         return text;
      }
      try {
         return URLEncoder.encode(text, "UTF-8");
      }catch(java.io.UnsupportedEncodingException ignored){
         /* best effort, fall back to the text as it was given */
         return text;
      }
   }

   /**
    * This <code>encode</code> method will escape the name=value
    * pair provided using the UTF-8 character set. The escaped
    * name and the escaped value are joined with an "=". A null
    * name or value appears as the text "null" as a result of the
    * string concatenation.
    *
    * @param name this is the name of that is to be escaped
    * @param value this is the value that is to be escaped
    *
    * @return the pair with % HEX HEX UTF-8 escape sequences
    */
   private String encode(String name, String value) {
      return encode(name) + "=" + encode(value);
   }

   /**
    * This <code>toString</code> method is used to compose an string
    * in the <code>application/x-www-form-urlencoded</code> MIME type.
    * This will encode the tokens specified in the <code>Set</code>.
    * Each element of the set is converted to a string and used as a
    * name, the value for that name is looked up in this object, and
    * the escaped name=value pairs are joined with "&amp;".
    *
    * @param set this is the set of parameters to be encoded
    *
    * @return returns a HTTP parameter encoding for the pairs
    */
   public String toString(Set set) {
      Object[] list = set.toArray();
      StringBuilder text = new StringBuilder();

      for(int i = 0; i < list.length; i++){
         String name = list[i].toString();
         String value = get(name);

         if(i > 0) {
            text.append("&");
         }
         text.append(encode(name, value));
      }
      return text.toString();
   }

   /**
    * This <code>toString</code> method is used to compose an string
    * in the <code>application/x-www-form-urlencoded</code> MIME type.
    * This will iterate over the keys of the inherited map, and each
    * name=value pair acquired is escaped using UTF-8. If there are
    * no entries this will return an empty string.
    *
    * @return returns a HTTP parameter encoding for the pairs
    */
   public String toString() {
      Set set = map.keySet();

      if(map.size() > 0) {
         return toString(set);
      }
      return "";
   }

   /**
    * This is used to mark regions within the buffer that represent
    * a valid token for either the name of a parameter or its value.
    * It stores only an offset and a length, the characters are read
    * from the buffer of the enclosing parser when the token is
    * converted to a string, which is why this is an inner class.
    */
   private class Token {

      /**
       * This represents the number of characters in the token.
       */
      public int len;

      /**
       * This represents the start offset within the buffer.
       */
      public int off;

      /**
       * In order to represent the <code>Token</code> as a value
       * that can be used this converts it to a <code>String</code>.
       * If the length of the token is less than or equal to zero
       * this will return and empty string for the value.
       *
       * @return this returns a value representing the token
       */
      public String toString() {
         if(len <= 0) {
            return "";
         }
         return new String(buf,off,len);
      }
   }
}