/* * PathParser.java February 2001 * * Copyright (C) 2001, Niall Gallagher * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. */ package org.simpleframework.http.parse; import java.io.Serializable; import org.simpleframework.http.Path; import org.simpleframework.util.parse.Parser; /** * This is used to parse a path given as part of a URI. This will read the * path, normalize it, and break it up into its components. The normalization * of the path is the conversion of the path given into it's actual path by * removing the references to the parent directories and to the current dir. *

* If the path that this represents is /usr/bin/../etc/./README * then the actual path, normalized, is /usr/etc/README. Once * the path has been normalized it is possible to acquire the segments as * an array of strings, which allows simple manipulation of the path. *

* Although RFC 2396 defines the path within a URI to have parameters this * does not extract those parameters this will simply normalize the path and * include the path parameters in the path. If the path is to be converted * into a OS specific file system path that has the parameters extracted * then the AddressParser should be used. * * @author Niall Gallagher */ public class PathParser extends Parser implements Path{ /** * Used to store the individual path segments. */ private TokenList list; /** * Used to store consumed name characters. */ private Token name; /** * Used to store consumed file extension. */ private Token ext; /** * Used to store the highest directory path. */ private Token dir; /** * Used to store consumed normalized path name. */ private Token path; /** * The default constructor will create a PathParser that * contains no specifics. The instance will return null * for all the get methods. The PathParser's get methods * may be populated by using the parse method. */ public PathParser() { this.list = new TokenList(); this.ext = new Token(); this.dir = new Token(); this.path = new Token(); this.name = new Token(); } /** * This is primarily a convineance constructor. This will parse the * String given to extract the specifics. This could be * achived by calling the default no-arg constructor and then using * the instance to invoke the parse method on that * String to extract the parts. * * @param path a String containing a path value */ public PathParser(String path){ this(); parse(path); } /** * This will parse the path in such a way that it ensures that at no * stage there are trailing back references, using path normalization. * The need to remove the back references is so that this * PathParser will create the same String * path given a set of paths that have different back references. For * example the paths /path/../path and /path * are the same path but different String's. *

* This will NOT parse an immediate back reference as this signifies * a path that cannot exist. So a path such as /../ will * result in a null for all methods. Paths such as ../bin * will not be allowed. */ protected void parse() { normalize(); path(); segments(); name(); extension(); } /** * This will initialize the parser so that it is in a ready state. * This allows the parser to be used to parse many paths. This will * clear the parse buffer objects and reset the offset to point to * the start of the char buffer. The count variable is reset by the * Parser.parse method. */ protected void init() { list.clear(); ext.clear(); dir.clear(); name.clear(); path.clear(); off = 0; } /** * This will return the extension that the file name contains. * For example a file name file.en_US.extension * will produce an extension of extension. This * will return null if the path contains no file extension. * * @return this will return the extension this path contains */ public String getExtension() { return ext.toString(); } /** * This will return the full name of the file without the path. * As regargs the definition of the path in RFC 2396 the name * would be considered the last path segment. So if the path * was /usr/README the name is README. * Also for directorys the name of the directory in the last * path segment is returned. This returns the name without any * of the path parameters. As RFC 2396 defines the path to have * path parameters after the path segments. * * @return this will return the name of the file in the path */ public String getName(){ return name.toString(); } /** * This will return the normalized path. The normalized path is * the path without any references to its parent or itself. So * if the path to be parsed is /usr/../etc/./ the * path is /etc/. If the path that this represents * is a path with an immediate back reference then this will * return null. This is the path with all its information even * the parameter information if it was defined in the path. * * @return this returns the normalize path without * ../ or ./ */ public String getPath() { return path.toString(); } /** * This will return the normalized path from the specified path * segment. This allows various path parts to be acquired in an * efficient means what does not require copy operations of the * use of substring invocations. Of particular * interest is the extraction of context based paths. This is * the path with all its information even the parameter * information if it was defined in the path. * * @param from this is the segment offset to get the path for * * @return this returns the normalize path without * ../ or ./ */ public String getPath(int from) { return list.segment(from); } /** * This will return the normalized path from the specified path * segment. This allows various path parts to be acquired in an * efficient means what does not require copy operations of the * use of substring invocations. Of particular * interest is the extraction of context based paths. This is * the path with all its information even the parameter * information if it was defined in the path. * * @param from this is the segment offset to get the path for * @param count this is the number of path segments to include * * @return this returns the normalize path without * ../ or ./ */ public String getPath(int from, int count) { return list.segment(from, count); } /** * This will return the highest directory that exists within * the path. This is used to that files within the same path * can be acquired. An example of that this would do given * the path /pub/./bin/README would be to return * the highest directory path /pub/bin/. The "/" * character will allways be the last character in the path. * * @return this method will return the highest directory */ public String getDirectory(){ return dir.toString(); } /** * This method is used to break the path into individual parts * called segments, see RFC 2396. This can be used as an easy * way to compare paths and to examine the directory tree that * the path points to. For example, if an path was broken from * the string /usr/bin/../etc then the segments * returned would be usr and etc as * the path is normalized before the segments are extracted. * * @return return all the path segments within the directory */ public String[] getSegments(){ return list.list(); } /** * This will return the path as it is relative to the issued * path. This in effect will chop the start of this path if * it's start matches the highest directory of the given path * as of getDirectory. This is useful if paths * that are relative to a specific location are required. To * illustrate what this method will do the following example * is provided. If this object represented the path string * /usr/share/rfc/rfc2396.txt and the issued * path was /usr/share/text.txt then this will * return the path string /rfc/rfc2396.txt. * * @param path the path prefix to acquire a relative path * * @return returns a path relative to the one it is given * otherwize this method will return null */ public String getRelative(String path){ return getRelative(new PathParser(path)); } /** * This is used by the getRelative(String) to * normalize the path string and determine if it contains a * highest directory which is shared with the path that is * represented by this object. If the path has leading back * references, such as ../, then the result of * this is null. The returned path begins with a '/'. * * @param path the path prefix to acquire a relative path * * @return returns a path relative to the one it is given * otherwize this method will return null */ private String getRelative(PathParser path){ char[] text = path.buf; int off = path.dir.off; int len = path.dir.len; return getRelative(text, off, len); } /** * This will return the path as it is relative to the issued * path. This in effect will chop the start of this path if * it's start matches the highest directory of the given path * as of getDirectory. This is useful if paths * that are relative to a specific location are required. To * illustrate what this method will do the following example * is provided. If this object represented the path string * /usr/share/rfc/rfc2396.txt and the issued * path was /usr/share/text.txt then this will * return the path string /rfc/rfc2396.txt. * * @param text the path prefix to acquire a relative path * @param off this is the offset within the text to read * @param len this is the number of characters in the path * * @return returns a path relative to the one it is given * otherwize this method will return null */ private String getRelative(char[] text, int off, int len){ int size = path.len - len + 1; /* '/' */ int pos = path.off + len - 1; for(int i = 0; i < len; i++){ if(text[off++] != buf[path.off+i]){ return null; } } if(pos < 0) { /* ../ */ return null; } return new String(buf,pos,size); } /** * This will extract the path of the given String * after it has been normalized. If the path can not be normalized * then the count is set to -1 and the path cannot be extracted. * When this happens then the path parameter is null. */ private void path() { if(count > 0){ path.len = count; path.off = 0; } } /** * This will simply read the characters from the end of the * buffer until it encounters the first peroid character. When * this is read it will store the file extension and remove the * characters from the buffer. */ private void extension() { int pos = off + count; /* index.html[]*/ int len = 0; while(pos-1 >= off) { /* index.htm[l]*/ if(buf[--pos]=='.'){ /* index[.]html*/ ext.off = pos+1; ext.len = len; count = pos; break; } len++; } } /** * This wil extract each individual segment from the path and * also extract the highest directory. The path segments are * basically the strings delimited by the '/' character of a * normalized path. As well as extracting the path segments * this will also extract the directory of path, that is, the * the path up to the last occurance of the '/' character. */ private void segments() { int pos = count - 1; int len = 1; if(count > 0){ if(buf[pos] == '/'){ /* /pub/bin[/] */ dir.len = pos+1; dir.off = 0; pos--; /* /pub/bi[n]/ */ } while(pos >= off){ if(buf[pos] == '/'){ /* /pub[/]bin/*/ if(dir.len == 0){ dir.len = pos+1; /* [/] is 0*/ dir.off = 0; } list.add(pos+1,len-1); len = 0; } len++; pos--; } } } /** * The normalization of the path is the conversion of the path * given into it's actual path by removing the references to * the parent directorys and to the current dir. So if the path * given was /usr/bin/../etc/./README then the actual * path, the normalized path, is /usr/etc/README. *

* This method ensures the if there are an illegal number of back * references that the path will be evaluated as empty. This can * evaluate any path configuration, this includes any references * like ../ or /.. within the path. * This will also remove empty segments like //. */ private void normalize(){ int size = count + off; int pos = off; for(off = count = 0; pos < size; pos++) { buf[count++] = buf[pos]; if(buf[pos] == '/') { if(count -1 > 0){ if(buf[count -2] == '/') /* [/]/./path/ */ count--; /* /[/]./path/ */ } } else if(buf[pos] == '.') { /* //[.]/path/ */ if(count -1 > 0) { /* /[/]./path/ */ if(buf[count - 2] !='/') /* /[/]./path./ */ continue; /* /path.[/] */ } if(pos + 2 > size){ /* /path/[.] */ count--; } else { if(buf[pos + 1] =='/'){ /* /.[/]path */ pos++;/* /[/]. */ count--; /* /.[/]path */ } if(buf[pos] !='.'){ /* /.[/]path */ continue; } if(pos + 2< size){ if(buf[pos + 2]!='/') /* /..[p]ath */ continue; /* /[.].path */ } if(count - 2 > 0) { for(count -= 2; count - 1 > 0;){ /* /path[/]..*/ if(buf[count - 1]=='/') { /* [/]path/..*/ break; } count--; } }else { /* /../ */ count = 0; off = 0; break; } pos += 2; /* /path/.[.]/ */ } } } } /** * This will extract the full name of the file without the path. * As regards the definition of the path in RFC 2396 the name * would be considered the last path segment. So if the path * was /usr/README the name is README. * Also for directorys the name of the directory in the last * path segment is returned. This returns the name without any * of the path parameters. As RFC 2396 defines the path to have * path parameters after the path segments. So the path for the * directory "/usr/bin;param=value/;param=value" would result * in the name "bin". If the path given was "/" then there will * be nothing in the buffer because extract will * have removed it. */ private void name(){ int pos = count; int len = 0; while(pos-- > off) { /* /usr/bin/;para[m] */ if(buf[pos]==';'){ /* /usr/bin/[;]param */ if(buf[pos-1]=='/'){ /* /usr/bin[/];param */ pos--; /* /usr/bin[/];param */ } len = 0; /* /usr/bin[/]*/ }else if(buf[pos]=='/'){ /* /usr[/]bin*/ off = pos + 1; /* /usr/[b]in*/ count = len; /* [b]in */ break; }else{ len++; } } name.len = count; name.off = off; } /** * This will return the normalized path. The normalized path is * the path without any references to its parent or itself. So * if the path to be parsed is /usr/../etc/./ the * path is /etc/. If the path that this represents * is a path with an immediate back reference then this will * return null. This is the path with all its information even * the parameter information if it was defined in the path. * * @return this returns the normalize path without * ../ or ./ */ public String toString(){ return getPath(); } /** * This is used so that the PathParser can speed * up the parsing of the data. Rather than using a buffer like * a ParseBuffer or worse a StringBuffer * this just keeps an index into the character array from the * start and end of the token. Also this enables a cache to be * kept so that a String does not need to be made * again after the first time it is created. */ private class Token implements Serializable { /** * Provides a quick retrieval of the token value. */ public String value; /** * Offset within the buffer that the token starts. */ public int off; /** * Length of the region that the token consumes. */ public int len; /** * If the Token is to be reused this will clear * all previous data. Clearing the buffer allows it to be * reused if there is a new URI to be parsed. This ensures * that a null is returned if the token length is zero. */ public void clear() { value = null; len = 0; } /** * This method will convert the Token into it's * String equivelant. This will firstly check * to see if there is a value, for the string representation, * if there is the value is returned, otherwise the region * is converted into a String and returned. * * @return this returns a value representing the token */ public String toString() { if(value != null) { return value; } if(len > 0) { value = new String(buf,off,len); } return value; } } /** * The TokenList class is used to store a list of * tokens. This provides an add method which can * be used to store an offset and length of a token within * the buffer. Once the tokens have been added to they can be * examined, in the order they were added, using the provided * list method. This has a scalable capacity. */ private class TokenList implements Serializable { /** * This is used to cache the segments that are created. */ private String[] cache; /** * Contains the offsets and lengths of the tokens. */ private int[] list; /** * Determines the write offset into the array. */ private int count; /** * Constructor for the TokenList is used to * create a scalable list to store tokens. The initial * list is created with an array of sixteen ints, which * is enough to store eight tokens. */ private TokenList(){ list = new int[16]; } /** * This is used to acquire the path from the segment that * is specified. This provides an efficient means to get * the path without having to perform expensive copy of * substring operations. * * @param from this is the path segment to get the path * * @return the string that is the path segment created */ public String segment(int from) { int total = count / 2; int left = total - from; return segment(from, left); } /** * This is used to acquire the path from the segment that * is specified. This provides an efficient means to get * the path without having to perform expensive copy of * substring operations. * * @param from this is the path segment to get the path * @param total this is the number of segments to use * * @return the string that is the path segment created */ public String segment(int from, int total) { int last = list[0] + list[1] + 1; if(from + total < count / 2) { last = offset(from + total); } int start = offset(from); int length = last - start; return new String(buf, start-1, length); } /** * This is used to acquire the offset within the buffer * of the specified segment. This allows a path to be * created that is constructed from a given segment. * * @param segment this is the segment offset to use * * @return this returns the offset start for the segment */ private int offset(int segment) { int last = count - 2; int shift = segment * 2; int index = last - shift; return list[index]; } /** * This is used to add a new token to the list. Tokens * will be available from the list method in * the order it was added, so the first to be added will * at index zero and the last with be in the last index. * * @param off this is the read offset within the buffer * @param len the number of characters within the token */ public void add(int off, int len){ if(count+1 > list.length) { resize(count *2); } list[count++] = off; list[count++] = len; } /** * This is used to retrieve the list of tokens inserted * to this list using the add method. The * indexes of the tokens represents the order that the * tokens were added to the list. * * @return returns an ordered list of token strings */ public String[] list(){ if(cache == null) { cache = build(); } return cache; } /** * This is used to retrieve the list of tokens inserted * to this list using the add method. The * indexes of the tokens represents the order that the * tokens were added to the list. * * @return returns an ordered list of token strings */ private String[] build(){ String[] value = new String[count/2]; for(int i =0, j = count/2; i< count; i+=2){ int index = j - (i/2) - 1; int off = list[i]; int size = list[i + 1]; value[index] = new String(buf, off, size); } return value; } /** * This is used to clear all tokens previously stored * in the list. This is required so that initialization * of the parser with the init method can * ensure that there are no tokens from previous data. */ public void clear(){ cache =null; count =0; } /** * Scales the internal array used should the number of * tokens exceed the initial capacity. This will just * copy across the ints used to represent the token. * * @param size length the capacity is to increase to */ private void resize(int size){ int[] copy = new int[size]; System.arraycopy(list,0,copy,0,count); list = copy; } } }