File: ElementTree.k

package info (click to toggle)
kaya 0.2.0-6
links: PTS
area: main
in suites: etch, etch-m68k
size: 3,012 kB
ctags: 1,307
sloc: cpp: 6,691; haskell: 4,833; sh: 2,868; yacc: 768; makefile: 700; perl: 87
file content (705 lines) | stat: -rw-r--r-- 22,989 bytes
/** -*-C-*-ish
    ElementTree.k Copyright (C) 2005 Chris Morris

    This file is distributed under the terms of the GNU Lesser General
    Public Licence. See COPYING for licence.
*/

module ElementTree;

import Prelude;
import Regex;
import Strings;
import Dict;
import public ElementTreeData;
import XMLentities;

// Outcome of analysing a start tag (the return type of analyseStartTag):
//   BlackListed    - a non-empty whitelist lists neither the tag nor "*"
//   Empty(e)       - the tag name is in the list of empty elements
//   ImplicitCloser - the tag is listed in implicitend for the current element
//   NormalOpen(o)  - any other tag; parseString recurses into its content
data StartTagType = BlackListed | Empty(ElementTree e)
     | ImplicitCloser | NormalOpen(ElementTree o);

// Exceptions used by this module.
// InputError (code 550) is the base for every String->ElementTree parsing
// failure; the three below it only prepend a more specific message. At the
// call sites in this file the String p is a "Line x, Column y" description
// built by getLP.
public Exception InputError(String s) = Exception("Error converting from String: "+s,550);
public Exception UnexpectedEnd(String p) = InputError("Unexpected end of String at "+p);
public Exception ExpectedChar(Char c, String p) = InputError("Expected "+c+" at "+p);
public Exception UnexpectedChar(Char c, String p) = InputError("Unexpected "+c+" at "+p);
// NotLazy (code 551) is thrown by evaluateGenerator when the indexed child
// is not a SubTree.
public Exception NotLazy = Exception("Tried to evaluate something that wasn't a generator.",551);


"DEPRECATED synonym for string()"
public String toString(var ElementTree etree, var EmptyTagMode mode = OpenAndClose, Int indentlevel = 0, var UnicodeFormat uform = LiteralUTF8, var ExistDict<String> breakers=newExist(1,strHash), var ExistDict<String> emptyels=newExist(1,strHash)) {
  // Kept only for backwards compatibility: every argument is handed to
  // string() unchanged and its result is returned as-is.
  return string(etree,mode,indentlevel,uform,breakers,emptyels);
}

"Returns the string representation of an ElementTree, for final printing"
public String string(ElementTree etree, EmptyTagMode mode = OpenAndClose, Int indentlevel = 0, UnicodeFormat uform = LiteralUTF8, ExistDict<String> breakers=newExist(1,strHash), ExistDict<String> emptyels=newExist(1,strHash)) {
  // one indent cache is shared by the whole recursive walk
  indentcache = makeIndents();
  // doToString appends to this through its var parameter
  result = "";
  // the Bool it returns only matters to recursive callers, so discard it
  void(doToString(result,etree,mode,indentlevel,uform,breakers,emptyels,indentcache));
  return result;
}

// Tells whether the element called 'name' is in the 'formats' set:
// false for nobreak, true otherwise.
Bool format(ExistDict<String> formats, String name) {
  return exists(formats,name);
}

// recursively does things with the string
// Appends the serialisation of etree, and recursively of its children, to
// 'output'. The return value is whether etree.name is in 'breakers' (i.e.
// whether an indent was written before its start tag); the parent stores it
// in 'lastse' to decide whether its own end tag gets an indent too.
// 'indents' is the indent cache created by makeIndents.
Bool doToString(var String output, ElementTree etree, EmptyTagMode mode = OpenAndClose, Int indentlevel = 0, UnicodeFormat uform = LiteralUTF8, ExistDict<String> breakers=newExist(1,strHash), ExistDict<String> emptyels=newExist(1,strHash), var [String] indents) {
  elformat = format(breakers,etree.name);
  if (elformat) {
    output += getIndent(indents,indentlevel);
  }
  // an element with children, or any element in OpenAndClose mode, is
  // written as start tag + content + end tag
  if (size(etree.elements) > 0 || mode == OpenAndClose) {
    output += startTag(etree,uform);
    // lastse: did the most recent child report itself as a breaker?
    lastse = false;
    for element in etree.elements {
      case element of {
	// either it's a sub-element
	SubElement(nested) -> lastse = doToString(output,nested,mode,indentlevel+1,uform,breakers,emptyels,indents);
	// or it's text (which can't contain tags...)
	| CData(cdata) -> lastse = false;
	  // don't need to encode quotes here.
	  output += simpleEncode(cdata,true,uform);
	  // or it's for lazy evaluation and will be a sub-element
	| SubTree(generator) -> lastse = lazyString(output,@generator,mode,indentlevel,uform,breakers,emptyels,indents); 
      }
    }
    // only indent the end tag when this element breaks and its last child
    // did too
    if (elformat && lastse) {
      output += getIndent(indents,indentlevel);
    }
    output += endTag(etree);
  } else {
    // if it contains nothing, and the parse mode is appropriate,
    // assume it's an empty tag.
    if (exists(emptyels,etree.name)) {
      output += singletonTag(etree,mode,uform);
    } else {
      // then we need to explicitly open and close it
      output += startTag(etree,uform)+endTag(etree);      
    }
  }
  //  gc();
  return elformat;
}

Bool lazyString(var String output, ElementTree() generator, EmptyTagMode mode = OpenAndClose, Int indentlevel = 0, UnicodeFormat uform = LiteralUTF8, ExistDict<String> breakers=newExist(1,strHash), ExistDict<String> emptyels=newExist(1,strHash), var [String] indents) {
  // run the generator now, then serialise what it produced one level deeper
  generated = generator();
  return doToString(output,generated,mode,indentlevel+1,uform,breakers,emptyels,indents);
}

"Sends an ElementTree directly to stdout - more memory-efficient than
string() but less flexible."
public Void lazyPrint(ElementTree etree, EmptyTagMode mode = OpenAndClose, Int indentlevel = 0, UnicodeFormat uform = LiteralUTF8, ExistDict<String> breakers=newExist(1,strHash), ExistDict<String> emptyels=newExist(1,strHash)) {
  // one indent cache for the whole walk
  indentcache = makeIndents();
  // the Bool result is only of interest to recursive calls
  void(doLazyPrint(etree,mode,indentlevel,uform,breakers,emptyels,indentcache));
}

// Writes etree, and recursively its children, straight to stdout with
// putStr instead of building a String. Returns whether etree.name is in
// 'breakers'; the parent keeps that in 'lastse' to decide whether its end
// tag is indented. Unlike doToString, the indent level passed to children
// only grows when this element is itself a breaker.
Bool doLazyPrint(ElementTree etree, EmptyTagMode mode = OpenAndClose, Int indentlevel = 0, UnicodeFormat uform = LiteralUTF8, ExistDict<String> breakers=newExist(1,strHash), ExistDict<String> emptyels=newExist(1,strHash), [String] indents) {
  elformat = format(breakers,etree.name);
  if (elformat) {
    putStr(getIndent(indents,indentlevel));
  }
  if (size(etree.elements) > 0) {
    putStr(startTag(etree,uform));
    // lastse: did the most recent child report itself as a breaker?
    lastse = false;
    // ninlevel is the indent level handed to the children
    if (elformat) {
      ninlevel = indentlevel+1;
    } else {
      ninlevel = indentlevel;
    }
    for element in etree.elements {
      case element of {
	SubElement(nested) -> lastse = doLazyPrint(nested,mode,ninlevel,uform,breakers,emptyels,indents);
	| CData(cdata) -> lastse = false;
	putStr(simpleEncode(cdata,true,uform));
	| SubTree(generator) -> lastse = doLazyPrint(generator(),mode,ninlevel,uform,breakers,emptyels,indents); //gc();
      }
    }
    
    if (elformat && lastse) {
      putStr(getIndent(indents,indentlevel));
    }
    putStr(endTag(etree));
  } else {
    // no children: a singleton tag is only used for names in emptyels and
    // only when the mode is not OpenAndClose
    if (exists(emptyels,etree.name) && mode != OpenAndClose) {
      putStr(singletonTag(etree,mode,uform));
    } else {
      putStr(startTag(etree,uform));
      putStr(endTag(etree));      
    }
  }
  return elformat;
}

// Builds the initial indent cache: entry n is a newline followed by n
// spaces, for n = 0..8. Spelling the strings out looks a bit silly, but it
// is more efficient than calling rep() every time.
[String] makeIndents() {
  // have some in to start with
  return ["\n",
          "\n ",
          "\n  ",
          "\n   ",
          "\n    ",
          "\n     ",
          "\n      ",
          "\n       ",
          "\n        "];
}

// Returns the indent string (newline followed by 'level' spaces) for the
// given level, taking it from the cache 'indents' and extending the cache
// when the level has not been needed before.
//
// The cache is filled one entry at a time up to 'level' rather than assuming
// that level-1 is already present: doToString raises the level for every
// nesting but only asks for an indent on breaker elements, so the requested
// level can jump by more than one.
String getIndent(var [String] indents, Int level) {
  if (size(indents) == 0) {
    // defensive: makeIndents never returns an empty cache, but start the
    // sequence properly if some caller passes one
    push(indents,"\n");
  }
  while (size(indents) <= level) {
    // each level is the previous one plus a single space
    push(indents,indents[size(indents)-1]+" ");
  }
  return indents[level];
}

private String startTag(ElementTree etree, UnicodeFormat uform) {
  // "<name attr=..." from openTag, closed with '>'
  return openTag(etree,uform)+">";
}

private String endTag(ElementTree etree) {
  // the closing tag carries only the element name
  return "</"+etree.name+">";
}

private String singletonTag(ElementTree etree, EmptyTagMode mode, UnicodeFormat uform) {
  // Tag for an element without content: ImpliedSingleton ends it with a
  // plain '>', every other mode with " />".
  case mode of {
    ImpliedSingleton -> return openTag(etree,uform)+">";
    | default -> return openTag(etree,uform)+" />";
  }
}

private String openTag(ElementTree etree, UnicodeFormat uform) {
  // Builds "<name a="v" b="w"" without the closing '>' so that callers can
  // finish it as a start tag or as a singleton.
  // this will need modification if we ever start on XML namespaces
  tag = "<"+etree.name;

  // keys and values are fetched as two parallel arrays
  // (only more efficient for tinyDict)
  attrnames = keys(etree.attributes);
  attrvalues = vals(etree.attributes);
  last = size(attrnames)-1;
  for idx in [0..last] {
    // attribute names are written as stored (no validation); values are
    // escaped including double quotes because they sit inside "..."
    escaped = simpleEncode(attrvalues[idx],false,uform);
    tag += " " + attrnames[idx] + "=\"" + escaped + "\"";
  }
  return tag;
}

"Encode a String to escape <>& assuming all ampersands need escaping. Quotes will be escaped by default but this can be turned off outside quoted strings."
public String simpleEncode(String invalue, Bool leavequotes=false, UnicodeFormat uform=LiteralUTF8) {
  // Escapes & and < (and " unless leavequotes is true) in invalue, then
  // applies the requested Unicode output form. The input is not modified.
  if (invalue == "") { return invalue; }
  // 'clean' means there is nothing for the replace() calls below to do
  clean = !elem('&',invalue) &&
          (leavequotes || !elem('"',invalue)) &&
          !elem('<',invalue);
  if (clean) {
    case uform of {
      // nothing to escape and no conversion wanted: hand back the input
      LiteralUTF8 -> return invalue;
      // Previously this path also returned the input untouched, so text
      // without & < " was never converted to numeric references.
      // literalToEntity gets a copy in case it works in place.
      | NumericReference -> return literalToEntity(copy(invalue));
    }
  }
  value = copy(invalue);
  // here we rely on the incoming code being literal UTF8. It should be.
  // & must be replaced first so the entities added below are not re-escaped
  replace("&","&amp;",value,[Global]);
  replace("<","&lt;",value,[Global]);
  //  replace(">","&gt;",value,[Global]); // unnecessary
  if (!leavequotes) {
    replace("\"","&quot;",value,[Global]);
  }
  // don't need to escape ' because we always use " to quote values
  case uform of {
    LiteralUTF8 -> return value;
    | NumericReference -> return literalToEntity(value);
  }
}

"Creates a new Element. 
If the number of child elements and data blocks is known in advance, then
there is a slight efficiency gain from specifying expectedchildren"
public ElementTree newElement(String name, Int expectedchildren=0) {
  // 10 is probably a reasonable guess for most elements; a positive
  // expectedchildren overrides it.
  capacity = 10;
  if (expectedchildren > 0) {
    capacity = expectedchildren;
  }
  // use Dict::newTiny because most elements have few attributes.
  return ElementTree(createArray(capacity),name,Dict::newTiny);
}

public Int attrHash(String tohash) {
  // the hash is simply the first character of the string converted to Int
  return Int(head(tohash));
}

"Adds an ElementTree to the end of an element's children"
public Void pushElement(ElementTree parent, ElementTree child) {
  // wrap the tree as a SubElement node and append it to the child list
  node = SubElement(child);
  push(parent.elements,node);
}

"Adds an ElementTree to the start of an element's children"
public Void unshiftElement(ElementTree parent, ElementTree child) {
  // wrap the tree as a SubElement node and put it in front of the others
  node = SubElement(child);
  unshift(node,parent.elements);
}

"This inserts an element into the tree at the specified position"
public Void addElementAt(ElementTree parent, ElementTree child, Int index) {
  // wrap the tree as a SubElement node and insert it at 'index'
  node = SubElement(child);
  addAt(parent.elements,node,index);
}

"Adds an ElementTree to the end of an element's children (lazy evaluation)"
public Void pushGenerator(ElementTree parent, ElementTree() child) {
  // store the generator itself (not its result) as a SubTree node at the end
  lazynode = SubTree(@child);
  push(parent.elements,lazynode);
}

"Adds an ElementTree to the start of an element's children (lazy evaluation)"
public Void unshiftGenerator(ElementTree parent, ElementTree() child) {
  // store the generator itself (not its result) as a SubTree node at the front
  lazynode = SubTree(@child);
  unshift(lazynode,parent.elements);
}

"This inserts an element into the tree at the specified position (lazy evaluation)"
public Void addGeneratorAt(ElementTree parent, ElementTree() child, Int index) {
  // store the generator itself (not its result) as a SubTree node at 'index'
  lazynode = SubTree(@child);
  addAt(parent.elements,lazynode,index);
}

"This will merge if the last element is already CData"
public Void pushData(ElementTree parent, String cdata) {
  count = size(parent.elements);
  if (count > 0) {
    // when the current last child is text, extend it instead of adding a
    // second text node
    case parent.elements[count-1] of {
      CData(text) -> parent.elements[count-1].cdata += cdata; return;
      | default -> ;
    }
  }
  // no children yet, or the last child is not text
  push(parent.elements,CData(cdata));
}

"This will merge if the first element is already CData"
public Void unshiftData(ElementTree parent, String cdata) {
  if (size(parent.elements) > 0) {
    // when the current first child is text, prepend to it instead of
    // adding a second text node
    case parent.elements[0] of {
      CData(text) -> parent.elements[0].cdata = cdata + parent.elements[0].cdata; return;
      | default -> ;
    }
  }
  // no children yet, or the first child is not text
  unshift(CData(cdata),parent.elements);
}

"This inserts data into the tree at the specified position. This will merge if the last element is already CData"
public Void addDataAt(ElementTree parent, String cdata, Int index) {
  // position 0 is exactly the unshift case, including its merge rule
  if (index == 0) {
    unshiftData(parent,cdata);
    return;
  }
  // Otherwise look at the child just before the insertion point: if that
  // one is text the new data is appended to it, else a new text node is
  // inserted at 'index'. The child after the insertion point is never
  // merged with.
  before = index-1;
  case parent.elements[before] of {
    CData(text) -> parent.elements[before].cdata += cdata;
    | default -> addAt(parent.elements,CData(cdata),index);
  }
}

"Evaluates the lazy generator at index <var>idx</var> and replaces it with
the results"
public Void evaluateGenerator(ElementTree element, Int idx) {
  case element.elements[idx] of {
    // run the generator once and keep its result in place of the generator
    SubTree(gen) -> produced = gen();
    element.elements[idx] = SubElement(produced);
    // anything that is not a generator is a caller error
    | default -> throw(NotLazy);
  }
}

public Void setAttribute(ElementTree element, String name, String value) {
  // Stores 'value' under 'name' in the element's attribute dictionary.
  // NOTE(review): presumably this replaces an existing attribute of the
  // same name - confirm against Dict's add().
  add(element.attributes,name,value);
}

public Void unsetAttribute(ElementTree element, String name) {
  // Removes the entry for 'name' from the element's attribute dictionary.
  // NOTE(review): behaviour for a name that is not present depends on
  // Dict's delete() - confirm.
  delete(element.attributes,name);
}

// Returns the total length of all text (CData) found in 'element',
// descending into sub-elements. A lazy SubTree is evaluated by calling its
// generator, which can be really inefficient.
public Int textSizeOfBlock(Element element) {
  case element of {
    CData(text) -> return length(text);
    | SubElement(el) -> return textSizeOfChildren(el);
    // the old code accumulated into a counter that was never set to 0 on
    // this path; both tree cases now share one helper with its own counter
    | SubTree(gen) -> return textSizeOfChildren(gen());
  }
}

// Sums textSizeOfBlock over the direct children of 'tree'.
Int textSizeOfChildren(ElementTree tree) {
  total = 0;
  for child in tree.elements {
    total += textSizeOfBlock(child);
  }
  return total;
}

// Accessor: the child list of 'block' (the list itself, not a copy is made here).
public [Element] getElements(ElementTree block) = block.elements;

// Accessor: the element name of 'block'.
public String getName(ElementTree block) = block.name;

// Accessor: the attribute dictionary of 'block'.
public TinyDict<String,String> getAttributes(ElementTree block) = block.attributes;

// Looks 'attribute' up in the attribute dictionary of 'block' and returns
// whatever lookup() yields for it.
public Maybe<String> getAttribute(ElementTree block, String attribute) = lookup(block.attributes,attribute);

"Finds the first occurrence of a particular element directly within another element"
public Maybe<Int> findElement(ElementTree block, String subname) {
  // Walks the direct children in order and returns just(position) of the
  // first one named 'subname', or nothing. Text children never match; lazy
  // children are evaluated (their generator is called) to learn their name.
  idx = 0;
  for child in block.elements {
    matched = false;
    case child of {
      SubElement(el) -> if (el.name == subname) { matched = true; }
      | SubTree(gen) -> produced = gen(); if (produced.name == subname) { matched = true; }
      | default -> ;
    }
    if (matched) {
      return just(idx);
    }
    idx++;
  }
  return nothing;
}

/* Start of String->ElementTree conversion functions */

"Converts a String to an ElementTree. Several additional parameters may
be passed to assist in conversion and to whitelist certain elements
and/or attributes."
public ElementTree elementTree(String original,
                               String rootelement="tree",
                               Dict<String,[String]> whitelist=Dict::new(19,strHash),
                               Dict<String,[String]> implicitend=Dict::new(19,strHash),
                               [String] empty=createArray(1)) {

  // everything parsed from 'original' becomes a child of this root
  root = newElement(rootelement);
  // parseString advances this cursor through its var parameter
  cursor = 0;

  parseString(original,root,cursor,whitelist,implicitend,empty);
  // the top-level call should consume the whole input; stopping short
  // means it returned early on a closing tag
  if (cursor < length(original)) {
    throw(InputError("Unexpectedly reached end at "+cursor+" before actual end of string."));
  }
  return root;
}

// go along collecting until we get to a tag start
// then parse the tag and recursively call on that tag
// or if it's an end tag, check it matches
// or check it's in implicitend
// return if it's a matching end tag
// if end of string reached, quit
// throw exceptions if something goes wrong
//
// 'tree' receives the parsed children; 'pos' is the shared read position
// in 'original' and is advanced in place. Every path through the loop body
// ends with pos++, so the helpers called here leave pos one short of where
// reading should continue (on the closing '>' of whatever they consumed).
Void parseString(String original, ElementTree tree, var Int pos, Dict<String,[String]> whitelist, Dict<String,[String]> implicitend, [String] empty) {
  
  // text collected since the last child was pushed
  str = "";
  len = length(original);
  while (pos < len) {
    // NOTE(review): this relies on firstOccurs returning the length of the
    // searched string when there is no '<' left - confirm.
    skip = firstOccurs('<',substr(original,pos,len-pos));
    if (skip > 0) {
      str += substr(original,pos,skip);
      pos += skip;
    }
    if (pos < len) { // c=='<' if we've not reached the end of the string
      if (pos+1 < len) {
	if (getIndex(original,pos+1) == '/') {
	  // closing tag: true means this element is finished (either its
	  // own end tag or an implicit close); false means a blacklisted
	  // end tag was skipped and parsing carries on here
	  if (parseEndTag(original,tree.name,pos,implicitend,whitelist)) {
	    if (str != "") {
	      pushData(tree,str);
	    }
	    return;
	  }
	} else if (getIndex(original,pos+1) == '!') {
	  // "<!--" starts a comment, any other "<!" is skipped as a tag;
	  // the text collected so far is kept and continues afterwards
	  if (pos+3 < len) {
	    if ((getIndex(original,pos+2) == '-') && (getIndex(original,pos+3) == '-')) {
	      pos += 4;
	      skipComment(original,pos);
	    } else {
	      skipTag(original,pos);
	    }
	  } else {
	    throw(UnexpectedEnd(getLP(original,pos+1)));
	  }
	} else {
	  // a start tag: flush pending text first so child order is kept
	  if (str != "") {
	    pushData(tree,str);
	  }
	  str = "";
	  // analyseStartTag leaves pos just past the tag for Empty and
	  // NormalOpen and untouched for BlackListed and ImplicitCloser;
	  // the pos-- below compensates for the pos++ at the loop end
	  case analyseStartTag(original,tree.name,pos,whitelist,implicitend,empty) of {
	    BlackListed -> skipTag(original,pos);
	    | Empty(newel) -> pushElement(tree,newel); pos--;
	    | ImplicitCloser -> pos--; return; // don't move on by pos in this case
	    | NormalOpen(newel) -> pushElement(tree,newel);
	    parseString(original,newel,pos,whitelist,implicitend,empty);
	  }
	}
      } else {
	// a lone '<' as the very last character
	throw(UnexpectedEnd(getLP(original,pos)));
      }
    }
    pos++;
  }
  if (str != "") {
    pushData(tree,str); //only happens on root of stack
  }
}

// find out what sort of end tag this is.
// if it matches the current element, return.
// if it matches an implicit end for the current element, RESET pos
// and return
// otherwise exception.
//
// Returns true when the current element is finished (its own end tag, or
// an end tag listed as "/name" in implicitend for it) and false when the
// end tag belonged to a blacklisted element and was merely skipped.
// On a real match pos is left on the closing '>'; on an implicit close it
// is moved one before the '<' so the caller's pos++ re-reads the tag.
// Throws UnexpectedEnd when the input stops inside the tag, ExpectedChar /
// UnexpectedChar for malformed tags and InputError for an end tag that
// closes nothing.
Bool parseEndTag(String original, String currentname, var Int pos, Dict<String,[String]> implicitend, Dict<String,[String]> whitelist) {
  // pos currently points at the '<'
  ipos = pos+1;
  len = length(original);
  if (ipos >= len) {
    throw(UnexpectedEnd(getLP(original,ipos)));
  } else if (getIndex(original,ipos) != '/') {
    throw(ExpectedChar('/',getLP(original,ipos)));
  }
  ipos++; // now points just after </
  tagname = getTagName(original,ipos);
  // getTagName stops at the end of the input without complaint ("</" or
  // "</name" as the last thing in the string); catch that here before
  // anything below indexes past the end (analyseStartTag has the same guard)
  if (ipos >= len) {
    throw(UnexpectedEnd(getLP(original,ipos)));
  }
  if (tagname == "") {
    throw(UnexpectedChar(getIndex(original,ipos),getLP(original,ipos)));
  }
  while(isWhitespace(getIndex(original,ipos))) {
    ipos++;
    if (ipos >= len) {
      throw(UnexpectedEnd(getLP(original,ipos)));
    }
  }
  if (getIndex(original,ipos) != '>') {
    throw(UnexpectedChar(getIndex(original,ipos),getLP(original,ipos)));
  }
  // ipos stays on the '>' - the caller steps past it
  if (tagname == currentname) {
    // simplest case, tag is closing the current element
    pos = ipos; // update pos
    return true;
  } else {
    iclosers = lookup(implicitend,currentname);
    case iclosers of {
      nothing -> ;
      | just(icloser) -> ctag = "/"+tagname;
      if (elem(ctag,icloser)) {
        // implicit closing. Don't update pos from ipos
        pos--; // but do decrement it
        return true;
      }
    }
    if (empty(whitelist) || exists(whitelist,tagname) || exists(whitelist,"*")) {
      throw(InputError("Unexpected closing tag "+tagname+" at "+getLP(original,pos)));
    } else {
      // it's a blacklisted tag, so assume we skipped an earlier start tag.
      pos = ipos; // we do update pos here.
      return false; // but don't drop a level
    }
  }
}

// Reads a tag or attribute name starting at ipos and returns it in lower
// case, leaving ipos on the first character after the name.
// Reading stops at 'closer', at whitespace, at the end of the input, and
// at '/', so that "<br/>" yields "br" and leaves the '/' for the caller
// (analyseStartTag already copes with it) instead of being rejected.
// Callers that do not expect a '/' at that point still report it as an
// unexpected character themselves.
// The first character must be a letter or '_'; later ones may also be
// digits, '.', '_' or '-'. Anything else throws UnexpectedChar. The result
// is "" when the very first character already ends the name.
String getTagName(String original, var Int ipos, Char closer= '>') {
  tagname = "";
  len = length(original);
  while (ipos < len && getIndex(original,ipos) != closer && getIndex(original,ipos) != '/' && !isWhitespace(getIndex(original,ipos))) {
    c = getIndex(original,ipos);
    if (isAlphanumeric(c) || c == '.' || c == '_' || c == '-') {
      if (tagname == "") {
        // harsher restrictions on first char
        if (isAlphabetical(c) || c == '_') {
          tagname += String(c);
        } else {
          throw(UnexpectedChar(c,getLP(original,ipos)));
        }
      } else {
        tagname += String(c);
      }
    } else {
      throw(UnexpectedChar(c,getLP(original,ipos)));
    }
    ipos++;
  }
  return toLowercase(tagname);
}

StartTagType analyseStartTag(String original, String currentname, var Int pos, Dict<String,[String]> whitelist, Dict<String,[String]> implicitend, [String] emptyels) {
  // Classifies the start tag whose '<' is at pos. pos is only moved (to
  // just past the tag's '>') when an element is actually returned; for
  // BlackListed and ImplicitCloser it is left on the '<'.
  cursor = pos+1; // work on a copy in case we get an implicit closer or a blacklist
  tagname = getTagName(original,cursor);
  if (tagname == "") {
    throw(UnexpectedChar(getIndex(original,cursor),getLP(original,cursor)));
  }
  // a non-empty whitelist must name the tag or contain the "*" wildcard
  if (!(empty(whitelist) || exists(whitelist,tagname) || exists(whitelist,"*"))) {
    return BlackListed;
  }
  case lookup(implicitend,currentname) of {
    nothing -> ;
    // implicit ending of current element, so need to end it before we start a new one
    | just(enders) -> if (elem(tagname,enders)) {
      return ImplicitCloser;
    }
  }
  // now cursor is just after the <elname
  newel = newElement(tagname);
  isempty = elem(tagname,emptyels);
  if (isempty) {
    rv = Empty(newel);
  } else {
    rv = NormalOpen(newel);
  }
  parseAttributes(original,cursor,newel,whitelist);
  // a '/' before the '>' is only acceptable on an empty element
  if (getIndex(original,cursor) == '/') {
    if (isempty) {
      cursor++; // skip it, we know
    } else {
      throw(InputError("Unexpected closing of non-empty element."));
    }
  }
  total = length(original);
  if (cursor >= total) {
    throw(UnexpectedEnd(getLP(original,cursor)));
  }
  while(isWhitespace(getIndex(original,cursor))) {
    cursor++;
    if (cursor >= total) {
      throw(UnexpectedEnd(getLP(original,cursor)));
    }
  }
  if (getIndex(original,cursor) != '>') {
    throw(UnexpectedChar(getIndex(original,cursor),getLP(original,cursor)));
  }
  // hand back the position just past the end of the tag
  pos = cursor+1;
  return rv;
}

Void parseAttributes(String original, var Int ipos, ElementTree el, Dict<String,[String]> whitelist) {
  // Reads attributes until the '/' or '>' that ends the tag and leaves ipos
  // on that character. Running out of input first is an error.
  total = length(original);
  while (ipos < total) {
    c = getIndex(original,ipos);
    if (c == '/' || c == '>') {
      return;
    }
    // whitespace between attributes is simply stepped over
    if (!isWhitespace(c)) {
      parseAttribute(original,ipos,el,whitelist,total);
    }
    // parseAttribute stops on the last character it consumed
    ipos++;
  }
  throw(UnexpectedEnd(getLP(original,ipos)));
}

// Parses one attribute starting at ipos (name, optionally followed by
// =value, where the value is "double quoted", 'single quoted' or a bare
// alphanumeric word) and stores it on 'el' if the whitelist permits.
// A name followed directly by whitespace is a minimised attribute and gets
// its own name as value. 'len' is length(original).
// ipos is left on the last character consumed; the caller steps past it.
// Throws UnexpectedEnd when the input stops inside the attribute and
// UnexpectedChar for anything malformed.
Void parseAttribute(String original, var Int ipos, ElementTree el, Dict<String,[String]> whitelist, Int len) {
  // ipos is just after the whitespace
  attrname = getTagName(original,ipos,'=');
  // getTagName stops quietly at the end of the input; report that properly
  // instead of indexing past the end below
  if (ipos >= len) {
    throw(UnexpectedEnd(getLP(original,ipos)));
  }
  // ipos is on '=', in theory
  if (isWhitespace(getIndex(original,ipos))) {
    // unless it's a minimised attribute
    attrval = attrname; //actually, name = val, here.
  } else {
    if (attrname == "" || getIndex(original,ipos) != '=') {
      throw(UnexpectedChar(getIndex(original,ipos),getLP(original,ipos)));
    }
    ipos++; // now on start of attribute value
    if (ipos >= len) {
      throw(UnexpectedEnd(getLP(original,ipos)));
    }
    c = getIndex(original,ipos);
    if (c == '"') {
      ipos++;
      attrval = getAttVal(original,ipos,'"',len);
    } else if (c == '\'') {
      ipos++;
      attrval = getAttVal(original,ipos,'\'',len);
    } else {
      // Char(0) tells getAttVal the value is unquoted
      attrval = getAttVal(original,ipos,Char(0),len);
    }
  }

  if (!empty(whitelist)) {
    allowed = allowedAttributes(whitelist,el.name);
    if (!elem(attrname,allowed) && !elem("*",allowed)) {
      // not whitelisted: silently drop the attribute
      return;
    }
  }
  setAttribute(el,attrname,attrval);
}

// Returns the attribute names the whitelist permits on element 'elname'.
// An element can be admitted through the "*" entry without having an entry
// of its own, so fall back to the "*" list in that case (dereferencing the
// element's own lookup unconditionally would fail there). With neither
// entry present nothing is permitted.
[String] allowedAttributes(Dict<String,[String]> whitelist, String elname) {
  case lookup(whitelist,elname) of {
    nothing -> ;
    | just(attrs) -> return attrs;
  }
  case lookup(whitelist,"*") of {
    nothing -> ;
    | just(attrs) -> return attrs;
  }
  return createArray(1);
}

// Reads an attribute value starting at ipos and returns it.
// 'ender' is the quote character that terminates the value, or Char(0)
// for an unquoted value, which may only consist of alphanumeric characters.
// 'len' is length(original).
// On return ipos is on the closing quote (quoted value) or on the last
// character of the value (unquoted value).
// Throws UnexpectedEnd if the input stops before the value is terminated,
// and UnexpectedChar if an unquoted value is followed by anything other
// than whitespace or '>'.
String getAttVal(String original, var Int ipos, Char ender, Int len) {
  if (ipos >= len) {
    throw(UnexpectedEnd(getLP(original,ipos)));
  }
  attrval = "";
  if (ender == Char(0)) {
    while (ipos < len && (isAlphanumeric(getIndex(original,ipos)))) {
      attrval += String(getIndex(original,ipos));
      ipos++;
    }
  } else {
    // distance from ipos to the closing quote
    // NOTE(review): relies on firstOccurs returning the length of the
    // searched string when 'ender' is absent, so that the end-of-input
    // check below fires - confirm.
    sz = firstOccurs(ender,substr(original,ipos,len-ipos));
    if (sz > 0) {
      attrval = substr(original,ipos,sz);
    }
    ipos+=sz;
  }
  if (ipos >= len) {
    throw(UnexpectedEnd(getLP(original,ipos)));
  }
  c= getIndex(original,ipos);
  if (ender == Char(0)) {
    if (!isWhitespace(c) && c != '>') {
      throw(UnexpectedChar(c,getLP(original,ipos)));
    }
    ipos--; // we reincrement later
  } 
  return attrval;
}

// Skips over a tag without interpreting it. pos starts anywhere inside the
// tag (normally on its '<') and is left ON the closing '>'; the caller
// steps past it. A '>' inside a single- or double-quoted attribute value
// does not end the tag. Throws UnexpectedEnd if the input stops before the
// tag is closed (which includes an unterminated quote).
Void skipTag(String original, var Int pos) {
  sq = false; // inside '...'
  dq = false; // inside "..."
  len = length(original);
  while (pos < len) {
    c = getIndex(original,pos);
    if (sq) {
      if (c == '\'') {
        sq = false;
      }
    } else if (dq) {
      if (c == '"') {
        dq = false;
      }
    } else if (c == '\'') {
      // the quote flags used to be tested but never set, so quoted '>'
      // characters ended the tag early
      sq = true;
    } else if (c == '"') {
      dq = true;
    } else if (c == '>') {
      return;
    }
    pos++;
  }
  throw(UnexpectedEnd(getLP(original,pos)));
}

Void skipComment(String original, var Int pos) {
  // pos starts just after "<!--". Advance until two consecutive '-' have
  // been seen and a '>' follows at some later point, leaving pos on that
  // '>'. Reaching the end of the input first is an error.
  //
  // dashes: 0 = no dash pending, 1 = one '-' just seen, 2 = "--" seen.
  // Once 2 is reached it is never reset; only '>' is looked for from then on.
  dashes = 0;
  total = length(original);
  while (pos < total) {
    c = getIndex(original,pos);
    if (dashes == 2) {
      if (c == '>') {
        return;
      }
    } else if (c == '-') {
      dashes++;
    } else {
      dashes = 0;
    }
    pos++;
  }
  throw(UnexpectedEnd(getLP(original,pos)));
}

// Describes character offset 'pos' of 'original' as "Line l, Column c"
// (both counted from 1) for use in error messages.
// "\n", a lone "\r", and "\r\n" each count as ONE line break. The old test
// compared the '\r' itself with '\n', which is always unequal, so every
// "\r\n" was counted as two lines.
String getLP(String original, Int pos) {
  line = 1;
  col = 1;
  for i in [0..pos-1] {
    c = getIndex(original,i);
    // a '\r' only breaks the line itself when no '\n' follows it within the
    // scanned range; otherwise the following '\n' does the counting.
    // i+1 < pos holds whenever the character at i+1 is looked at.
    if (c == '\n' || (c == '\r' && (i+1 == pos || getIndex(original,i+1) != '\n'))) {
      line++;
      col = 1;
    } else {
      col++;
    }
  }
  return "Line "+line+", Column "+col;
}