#!/usr/bin/env perl

# Flex(1) XML processor scanner generator.
# Copyright © 1999 Kristoffer Rose.  All rights reserved.
#
# This file is part of the FleXML XML processor generator system.
# Copyright © 1999 Kristoffer Rose.  All rights reserved.
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
# more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc., 59
# Temple Place, Suite 330, Boston, MA 02111-1307 USA.

use warnings;

my $Id = '$Id: flexml.pl,v 1.44 2005/02/23 23:08:16 mquinson Exp $ ';
$Id =~ s/\s*\$\s*//go;

# IMPORTS.

use Getopt::Std;
use LWP::UserAgent;
use Carp qw/cluck/;

use strict;

# FILES (must be global).

use vars qw/ $SKELETON /;	# the skeleton scanner
use vars qw/ $SCANNER /;	# generated XML processor (+ application if requested)
use vars qw/ $HEADER /;		# generated XML processor API header
use vars qw/ $ACTIONS /;	# the actions file
use vars qw/ $APPLICATION /;	# generated XML application

# OPTIONS (and other globals).

my $Use;			# usage string
my %opt = ();			# options

my $debug;			# -d option flag
my $verbose;			# -v option flag
my $lineno;			# -L option flag
my $nofail;			# -X option flag
my $quiet_parser;		# -q option flag
my $uri;			# -u option uri
my $pubid;			# -p option string
my $stacksize;			# -b option flag

my $dtd;			# DTD file name (or URI)
my $dtdrevision;		# DTD version pruned from file
my $cdtd;			# C identifier derived from DTD name

my @inputs = ();		# input data stack (array of arrays of lines)
my @inputnames = ();		# names of files on input data stack
my @inputlinenos = ();		# line number in input files

# DTD regular expressions (extended to not mess up the counting).

my $NameChar = '[A-Za-z0-9.:_-]';
my $Name     = "[A-Za-z_:]$NameChar*";
my $Names    = "$Name(?:\\s+$Name)*";
my $Nmtoken  = "$NameChar+";
my $Nmtokens = "$Nmtoken(?:\\s+$Nmtoken)*";
my $Eq       = '\s*=\s*';
my $Literal  = '(?:\'[^\']*\'|"[^"]*")';

# DTD DESCRIPTION STRUCTURES.

my %source = ();	# DTD source lines associated to tag and tag/attribute.

my @tags = ();		# Tags of the DTD.
my %roottags = ();	# Tags that may be the root tag.
my %ctag = ();		# C variable name of each tag.

my %states = ();	# $states{tag} is list of states used by tag element.
my %emptytrans = ();	# $emptytrans{state} contains empty transitions in automaton.

my %instates = ();	# $instates{tag} is list of states for element start/empty tag.
my %startstate = ();	# $startstate{tag} is the state entered after the start tag.
my %endstates = ();	# $endstates{tag} is list of states for element end tag.
my %exittrans = ();	# $exittrans{tag} is state transitions after end tag.

my %empty = ();		# $empty{tag} == true if the tag may be empty.
my %properempty = ();	# $empty{tag} == true if the tag is declared EMPTY
my %any = ();		# $any{tag} == true if the tag has ANY contents.
my %mixed = ();		# $mixed{tag} == true if the tag has Mixed (or ANY) contents.
my %children = ();	# $children{tag} == true if the tag has Element contents

my %inmixed = ();	# $inmixed{tag} == true if the tag occurs *in* Mixed contents.

my @attributes = ();	# Attributes of the DTD.
my %catt = ();		# C variable name for attribute.

my %atttype = ();	# XML AttType (type) of tag/attribute
my %enumtype = ();	# whether the tag/attribute is an enumeration type
my %literaltype = ();	# whether the tag/attribute is a tokenized type
my %entitytype = ();	# whether the tag/attribute is of entity type
my %typeof = ();	# C type of tag/attribute

my %attdef = ();	# XML AttDef (default value) of tag/attribute
my %required = ();	# true if tag/attribute is required
my %fixed = ();		# true if tag/attribute has fixed default
my %initof = ();	# C initial attribute value of tag/attribute, if any

my %attlist = ();	# $attlist{tag} is comma-separated list of attribute
			# names allowed in tag elements.
my %withattr = ();	# $withattr{attribute} is comma-separated list of
			# elements within which the tag element may occur.

my %entity = ();	# general entity table (C strings)
my %parameter = ();	# parameter entity table (raw string)
my %external = ();      # external entity table (uris)

my %startok = ();	# start tag action already dumped
my %endok = ();		# end tag action already dumped


# UTILITIES.


sub printsource { # Print source lines of argument.
  my ($key) = @_;
  local $_ = $source{$key};
  return if not $_;

  s:[*][/]:* /:g;		# avoid */ in output [sic]
  s/\n/\n  * /mg;
  print "\n /* " . $_ . "  */\n";
}


sub cquote { # Convert a string to C source format.
  local ($_) = @_;
  s/\\/\\\\/go;			# First replace \ to avoid interference...
  s/\"/\\\"/go;
  s/\n/\\n/go; s/\r/\\r/g; s/\t/\\t/g; s/\f/\\f/go;
  s/[\0-\037\200-\377]/ sprintf("\\%.3o",ord($&)) /ge;
  $_
}


sub variablify { # Change XML Name to legal C variable name.
  local ($_) = @_;
  s|-|_d_|go;
  s|:|_c_|go;
  s|/|__|go;
  $_
}


sub redistribute { # Print C comment with generated file "license".
                   # Notice that this is not intended to affect
                   # flexml.pl itself.
  my ($pre) = @_;
  print <<EOT;
$pre This program was generated with the FleXML XML processor generator.
$pre FleXML is Copyright © 1999-2005 Kristoffer Rose.  All rights reserved.
$pre ($Id).
$pre 
$pre There are two, intertwined parts to this program, part A and part B.
$pre
$pre Part A
$pre ------
$pre 
$pre Some parts, here collectively called "Part A", are found in the 
$pre FleXML package.  They are Copyright © 1999-2005 Kristoffer Rose. 
$pre All rights reserved.
$pre
$pre You can redistribute, use, perform, display and/or modify "Part A"
$pre provided the following two conditions hold:
$pre
$pre 1. The program is distributed WITHOUT ANY WARRANTY from the author of
$pre    FleXML; without even the implied warranty of MERCHANTABILITY or
$pre    FITNESS FOR A PARTICULAR PURPOSE.
$pre
$pre 2. The program distribution conditions do not in any way affect the
$pre    distribution conditions of the FleXML system used to generate this
$pre    file or any version of FleXML derived from that system.
$pre
$pre Notice that these are explicit rights granted to you for files
$pre generated by the FleXML system.  For your rights in connection with
$pre the FleXML system itself please consult the GNU General Public License.
$pre 
$pre Part B
$pre ------
$pre 
$pre The other parts, here collectively called "Part B", and which came 
$pre from the DTD used by FleXML to generate this program, can be 
$pre distributed (or not, as the case may be) under the terms of whoever
$pre wrote them, provided these terms respect and obey the two conditions 
$pre above under the heading "Part A".
$pre
$pre The author of and contributors to FleXML specifically disclaim
$pre any copyright interest in "Part B", unless "Part B" was written 
$pre by the author of or contributors to FleXML.
$pre 
EOT
}


sub api_functions { # Print XML application interface functions.
  my ($pre,$post) = @_;
  print "/* XML application entry points. */\n" if @tags;
  for (@tags) {
    print $pre . "void STag_$ctag{$_}(void)$post\n" unless $startok{$_};
    print $pre . "void ETag_$ctag{$_}(void)$post\n" unless $endok{$_};
  }
}


sub api_types { # Print XML application interface types.
  print "/* XML application data. */\n" if %atttype;
  for (keys %atttype) {
    if (m.($Nmtoken)[/]($Nmtoken).xo) {
      my ($tag,$attribute) = ($1,$2);
      print "typedef $typeof{$_} AT_$ctag{$tag}_$catt{$attribute};\n";
      print "#define AU_$ctag{$tag}_$catt{$attribute} NULL\n"
	if not $enumtype{$_};
    }
  }
}


sub api_data { # Print XML application interface parameters.
  my ($pre) = @_;
  print "/* FleXML-provided data. */\n";
  print $pre . "const char* pcdata;\n";
  for (keys %atttype) {
    if (m.($Nmtoken)[/]($Nmtoken).xo) {
      print $pre . "AT_$ctag{$1}_$catt{$2} A_$ctag{$1}_$catt{$2};\n";
    }
  }
}


sub expandparametersat { # Expand parameter entities in $_ at $place.
  my ($place) = @_;
  while ( m/$place\s*%($Name);/ ) {
    if ($parameter{$1}) {
      s/$place(\s*)%($Name);/ ' '. $1 . $parameter{$2} . ' ' /xe;
    }
  }
}


sub geturl { # Insert contents of URL into input stream at current point

  my ($url) = @_;
  $url = "file:$url" if not $url =~ m/:/;
  local $_ = $url;
  s.([^/:]*[/:])*..g;
  s/\.dtd$//;

  print STDOUT "Fetching URL <$url>.\n" if $verbose;
  my $ua = new LWP::UserAgent;  # Create a user agent object
  $ua->agent("FleXML/1 " . $ua->agent);
  my $req = new HTTP::Request GET => $url; # Create a request
  my $res = $ua->request($req);	# Pass request to the user agent and get a response
  if ($res->is_error) {
    die place()."URL <$url> could not be fetched.\n";
  }

  # Make the data available...
  push @inputs, [ split /\r?\n/, $res->content ];
  if (@{$inputs[$#inputs]}) {
    push @inputnames, "$url";
    push @inputlinenos, 0;
  }
  else {
    pop @inputs;		# oops, empty.
  }

#  print STDOUT "Testing:\n";
#  my $i = 0;
#  for my $ref (@inputs) {
#    print STDOUT " inputs[" . $i++ . "] =\n";
#    for my $line (@{$ref}) {
#      print " | $line\n";
#    }
#  }
#  $i = 0;
#  for my $name (@inputnames) {
#    print STDOUT " inputnames[" . $i++ . "] = `$name'\n";
#  }
#  $i = 0;
#  for my $no (@inputlinenos) {
#    print STDOUT " inputlinenos[" . $i++ . "] = `$no'\n";
#  }
}


sub nextline {			# return one input line

  return undef unless @inputs;

  my $line = shift @{$inputs[$#inputs]};
  $inputlinenos[$#inputs]++;

  while (@inputs and not @{$inputs[$#inputs]}) { # discard exhausted inputs
    pop @inputs;
    pop @inputnames;
    pop @inputlinenos;
  }

  return $line;
}


sub place {
  if (@inputs) {
    local $_ = "\"$inputnames[$#inputs]\", line $inputlinenos[$#inputs]: ";
    s/"file:/"/;
    return $_;
  }
  else {
    return "";
  }
}


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
# extractcp($str) - Split argument in one cp[48] (content particles) and the rest.
#
# returns ($cp,$rest)
#
sub extractcp { 

  # $_ stores the remainder of the string we're looking at
  local ($_) = @_;
  
  # shack - I noticed that the 'match one Name' pattern below
  # does not accept leading spaces, but the ( pattern does
  # This is a little warning that will enlighten me if this occurs
  m/^\s+/ and do {
    cluck("extractcp matching leading spaces!  '$_'");
  };

  if ( m/^($Name[+?*]?)\s*/o ) {    # match one Name
    return ($1,$'); #'
  }
    
  if ( m/^\s*\(/o ) {               # match the start of a choice or seq
  
    # build up one CP my matching parens
    my $cp = '('; $_ = $'; #'
    my $level = 1;

    # match nested parenthesis
    while ($level > 0 and $_) {

      if ( m/^\s*\(\s*/o ) {              # open paren
	$level++; 
        $cp .= '('; 
        $_ = $'; #'
      }
      elsif ( m/^\s*(\)[+?*]?)\s*/o ) {   # close paren
	$level--; 
        $cp .= $1; 
        $_ = $'; #'
      }
      elsif ( m/^\s+/o ) {                # skip white space
	$_ = $'; #'
      }
      else {                              # everything else
	m/[^()\s]+/o; 
        $cp .= $&; 
        $_ = $'; #'
      }
    }
    return ($cp,$_);
  }

  confess ("should not get here!");
}


my $statecounter;

sub analysechildren { # Analyse DTD children specification; return
                      # true if it may be empty.  Uses global $statecounter.

  my ($tag,$re,$in,$out) = @_;

  #print "analysechildren [ $tag, $re, $in, $out ] \n";

  local $_ = $re;

  if ( m/^\s*($Name)\s*$/o ) { # tag

    my $thetag = $1;
    my %ins = ();
    if (exists $instates{$thetag}) {
      for (split /,/,$instates{$thetag}) { $ins{$_} = 'true'; }
    }
    $ins{$in} = 'true';
    $instates{$thetag} = join(',',keys %ins);
    $exittrans{$thetag} .= ($exittrans{$thetag}?',':'') . "$in=>$out" if $in ne $out;
    return undef;

  }
  elsif ( m/^((.|\n)+)\?\s*$/o ) { # re ?

    $emptytrans{$in} .= ($emptytrans{$in}?',':'') . $out unless $in eq $out;
    analysechildren($tag,$1,$in,$out);
    return 'true';

  }
  elsif ( m/^((.|\n)+)\+\s*$/o ) { # re +

    my $re = $1;
    my $s1 = "S_$ctag{$tag}_" . (++$statecounter);
    my $s2 = "S_$ctag{$tag}_" . (++$statecounter);
    $states{$tag} .= ",$s1,$s2";
    $emptytrans{$in} .= ($emptytrans{$in}?',':'') . $s1;
    $emptytrans{$s2} = "$s1,$out";
    return analysechildren($tag,$re,$s1,$s2);

  }
  elsif ( m/^((.|\n)+)\*\s*$/o ) { # re *

    return analysechildren($tag,"$1+?",$in,$out);

  }

  elsif ( m/^\s*\(\s*((.|\n)+)\s*\)\s*$/xo ) { # choice or seq

    local $_;
    my $cp;
    ($cp,$_) = extractcp($1);

    if ( m/^\s*$/ ) { # () with single member.

      return analysechildren($tag,$cp,$in,$out);

    }
    elsif ( m/^\s*([|,])\s*/m ) {
      my $type = "[$1]";
      my $maybeempty = ($type eq '[,]');
      my $state = $in;

      while ( m/^\s*$type\s*/ ) {
	$_ = $'; #'

	if ($type eq '[|]') { # $cp is choice
	  $maybeempty = 'true' if analysechildren($tag,$cp,$in,$out);
	}
	else { # $cp is seq component
	  my $oldstate = $state;
	  $state = "S_$ctag{$tag}_" . (++$statecounter);
	  $states{$tag} .= ",$state";
	  $maybeempty = undef unless analysechildren($tag,$cp,$oldstate,$state);
	}
	($cp,$_) = extractcp($_);
      }

      # Last cp needs special treatment in sequence.
      if ($type eq '[|]') { # $cp is choice
        $maybeempty = 'true' if analysechildren($tag,$cp,$in,$out);
      }
      else { # $cp is seq component
	$maybeempty = undef unless analysechildren($tag,$cp,$state,$out);
      }

      $emptytrans{$in} .= ($emptytrans{$in}?',':'') . $out
	if $maybeempty and $in ne $out;
      return $maybeempty unless $_;
    }
  }
  die place()."DTD element `$tag' has nonsense fragment `$_'.\n";
}


# OPTIONS PROCESSING (explained in manual).


# Parse options.
$Use = "Usage: flexml [-ASHDvdqnLXV] [-s skel] [-p pubid] [-u uri]\n"
     . "       [-b stack_size] [-r roottags] [-a actions] name[.dtd]";

getopts('ASHDvdnLXVqp:b:s:u:r:a:', \%opt);

# Version!
print "FleXML version $Id.\n" if $opt{V} or $opt{v};
exit 0 if $opt{V};

# Debugging?
$debug = $opt{d};
$verbose = $opt{v};

# Line numbers?
$lineno = $opt{L};

# Quiet parser?    
$quiet_parser = $opt{q};
    
# Exit without fail message?
$nofail = $opt{X};

# Specific root tags?
if ($opt{r}) {
  for (split ',',$opt{r}) { $roottags{$_} = 'true'; }
}

# Specific stack size?
$stacksize = ($opt{'b'} ? $opt{'b'} : 100000);

# Set skeleton scanner file name and check it is readable (if needed).
$SKELETON = ($opt{'s'} ? $opt{'s'} : './skel');
die "$0: No skeleton file $SKELETON.\n" if not -r $SKELETON and $opt{S};

# Set document type URI and PUBID.
$uri = $opt{u} if $opt{u};
$pubid = $opt{p} if $opt{p};

# Set DTD file name...and extract prefix for later
my $prefix = $ARGV[0];
if (defined $prefix and $#ARGV == 0) {
  $prefix =~ s/\.dtd$//;
  $dtd = "$prefix.dtd";		# Require .dtd extension on DTD
  geturl($dtd);			# Read the DTD
  $prefix =~ s|^([^:/]*[:/])*||;
  $cdtd = variablify($prefix);
}
else {
  die "$Use\n";
}  

# Selection options: If none of -SHDA specified then default to -SH.
# Furthermore -a implies -D.
$opt{S} = $opt{H} = 'true' unless ($opt{S} or $opt{H} or $opt{D} or $opt{A});
$opt{D} ||= $opt{a} unless $opt{A};

# Set default (DTD-based) output file names.
$SCANNER = "$prefix.l";
$HEADER =  "$prefix.h";
$APPLICATION = "$prefix-dummy.c";

# Set actions=based output file names, if any.
if ($ACTIONS = $opt{a}) {
  $opt{a} =~ s/\.[a-z]+$//;
  $APPLICATION = "$opt{a}.c";
}

# Stand-alone applications...
if ($opt{A}) {
  die "$0: -A conflicts with -SHD.\n" if ($opt{S} or $opt{H} or $opt{D});
  $SCANNER = $APPLICATION;
  $SCANNER =~ s/\.c$/.l/;
}

# Dry-run?
if ($opt{n}) {
  $opt{A} = $opt{S} = $opt{H} = $opt{D} = undef;
}


# PARSE DTD.


print STDOUT "Processing DTD in $dtd.\n" if $verbose;

$_ = '';			# Current entry

while (@inputs) {		# While there are lines...

  my $orig = "$_";		# current source line(s)

  # Skip spaces and complete comments (but save as source).
  do {
    # Skip blank lines.
    $_ = nextline() while @inputs and m/^\s*$/;

    # If we're looking at a parameter or external entity then expand it.
    if ( m/^\s*%($Name);/ ) {
      my $ent = $1;
      if ($parameter{$ent}) {
        expandparametersat('^');
      }
      elsif ($external{$ent}) {
        $_ = $'; #'
        geturl($external{$ent});
      }
      else {
        die "Unknown entity `%$ent;'\n";
      }
    }

    # Skip (but save) comments.
    while ( m/^\s*<!--/ ) {
      until (m/-->/ or not @inputs) { $_ .= "\n" . nextline(); }
      # Extract first DTD version number...
      if ( not $dtdrevision and /\$(Id|Header|Revision): [^\$]*\$/ ) {
	$dtdrevision = "$&";
	$dtdrevision =~ s/\s*\$\s*//go;
      }
      # Remove the comment to read on to next nonblank (but save as
      # source). 
      $orig .= ($orig?"\n":"").$1 if s/^\s*(<!--([^-]|-[^-]|--[^>])*-->)\s*//;
    }
    s/^\s*//;
  } until $_ or not @inputs;

  # If we're looking at a parameter or external entity then expand and retry.
  if ( m/^\s*%($Name);/ ) {
    my $ent = $1;
    if ($parameter{$ent}) {
      expandparametersat('^');
    }
    elsif ($external{$ent}) {
      $_ = $'; #'
      geturl($external{$ent});
      next;
    }
    else {
      die "Unknown entity `\%$ent;'\n";
    }
  }

  die place()."Nonsense `$_'.\n" if /^[^<]/ or /^<[^!]/;

  # Read on until a full DTD <!...> or <?...?> entry is available.
  until (m/^\s*<![^>]*>/o or m/^\s*<[?]([^?]|[?][^>])*[?]>/o or not @inputs) {
    my $line = nextline();
    $orig .= ($orig?"\n":"") . $line;
    $_ .= "\n" . $line;
  }

  unless ( m/^\s*<![^>]*>/o or m/^\s*<[?]([^?]|[?][^>])*[?]>/o ) {
    last if not @inputs;
    die place()."Could not find end of declaration.\n";
  }

  # Clean out in $orig.
  $orig =~ s/\n+/\n/g;
  $orig =~ s/^\n*//g;
  $orig =~ s/\n*$//g;

  print STDOUT " [$_]\n" if $debug;

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
  # Processing instruction.
  if ( m/^\s*<[?]([^?]|[?][^>])*[?]>\s*/o ) {
    print STDERR place()."Warning: ignoring processing instruction $&.\n";
    $_ = $'; #'
  }

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
  # Parse element declarations.
  elsif ( m/^<!ELEMENT\s+($Name)\s+([^>]*)>\s*/xo ) {

    my ( $tag, $token_source ) = ( $1, $2 );

    # strip off the matched code from the beginning of $_
    $_ = $'; #'

    # ????? - most of the time $orig is undef
    $source{$tag} = "$orig";

    # place the newly found tag into the list of tags @tags
    die place()."Repeated element $tag.\n" if $ctag{$tag};
    push @tags, $tag;

    # Create C-friendly tag names.
    $ctag{$tag} = variablify($tag) unless $ctag{$tag};
    my $c = $ctag{$tag};

    # start looking at the token_source ($2)
    local $_ = $token_source;
    expandparametersat(''); s/^\s+//;

    # All elements should be followed by nothing when at the root.
    # IF there is a list of roottags ($0 -r ..,..), then only add
    # the exittrans if it is in the list
    $exittrans{$tag} .= ($exittrans{$tag}?',':'') . "ROOT_$c=>EPILOG"
      if not %roottags or $roottags{$tag};

    # Handle element declaration.
    if ( m/^EMPTY\s*$/o ) {
      $empty{$tag} = 'true';
      $properempty{$tag} = 'true';
      $states{$tag} = "E_$c";
      $startstate{$tag} = "E_$c";
      $endstates{$tag} = "E_$c";
    }
    elsif ( m/^ANY\s*$/o ) {
      $any{$tag} = 'true';
      $mixed{$tag} = 'true';
      $empty{$tag} = 'true';
      $states{$tag} = "IN_$c";
      $startstate{$tag} = "IN_$c";
      $endstates{$tag} = "IN_$c";
    }
    elsif ( m/^\(\s*\#PCDATA\s*\)\s*$/o
	    or m/^\(\s*\#PCDATA\s*((\|\s*$Name\s*)*)\)\*\s*$/xo ) {
      $mixed{$tag} = 'true';
      $empty{$tag} = 'true';
      if ($1) {
	my $desc = $1;
	$desc =~ s/^\s*\|\s*//o;
	for (split /\s*\|\s*/,$desc) {
	  $instates{$_} .= ($instates{$_}?',':'') . "IN_$c";
	  $inmixed{$_} = 'true';
	}
      }
      $states{$tag} = "IN_$c";
      $startstate{$tag} = "IN_$c";
      $endstates{$tag} = "IN_$c";
    }
    else {
      $children{$tag} = 'true';
      $statecounter = 0;
      $states{$tag} = "S_$c";
      $startstate{$tag} = "S_$c";
      $empty{$tag} = 'true' if analysechildren($tag,$_,"S_$c","E_$c");
      $states{$tag} .= ",E_$c";
      $endstates{$tag} = "E_$c";
    }
  }

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
  # Parse attribute declarations.
  elsif ( m/^<!ATTLIST\s+($Name)\s+([^>]*)>\s*/o ) {

    $_ = $'; #'
    {
      my $tag = $1;
      local $_ = $2;

      expandparametersat(''); s/^\s+//;

      # Repeat while there are attribute declarations.
      while ( s/^($Name)\s+([A-Z]+|\(\s*$Nmtoken\s*(?:\|\s*$Nmtoken\s*)*\))
		(?:\s+(\#IMPLIED|\#REQUIRED|(?:\#FIXED\s+)?$Literal))?\s*//xo ) {

	my ($attribute,$type,$default) = ($1,$2,$3);

	if ($atttype{"$tag/$attribute"}) {
	  print place()."Warning: Redeclaration of element $tag attribute $attribute ignored.\n";
	}
	else {
	  if ($orig) { # to only print the source once once
	    $source{"$tag/$attribute"} = "$orig";
	    $orig = '';
	  }

	  $ctag{$tag} = variablify($tag) unless $ctag{$tag};
	  $catt{$attribute} = variablify($attribute) unless $catt{$attribute};

	  # Add atribute to the appropriate lists.
	  $attlist{$tag} .= ($attlist{$tag}?',':'') . "$attribute";
	  if ($withattr{$attribute}) {
	    $withattr{$attribute} .= ",$tag"; 
	  }
	  else {
	    push @attributes, $attribute;
	    $withattr{$attribute} = "$tag"; 
	  }

	  # Analyse default value.
	  if (defined $default) {
	    if ($default eq '#REQUIRED') {
	      $required{"$tag/$attribute"} = 'true';
	      $default = undef;
	    }
	    elsif ($default eq '#IMPLIED') {
	      $default = undef;
	    }
	    else {
	      $fixed{"$tag/$attribute"} = 'true' if $default =~ s/\#FIXED\s+//o;
	      $default =~ s/^'([^'']*)'$/$1/o unless $default =~ s/^"([^""]*)"$/$1/o;
	    }
	  }

	  # Store attribute default string and type.
	  $attdef{"$tag/$attribute"} = $default if $default;
	  $atttype{"$tag/$attribute"} = $type;

	  # Handle enumeration types...
	  if ( $type =~ m/^\(((.|\n)*)\)$/x ) {
	    local $_ = $1;
	    s/\s+//go;
	    s/\|/,/go;
	    $enumtype{"$tag/$attribute"} = "$_";
	    s/$Nmtoken/ "A_$ctag{$tag}_$catt{$attribute}_" . variablify($&) /xge;
	    my $undefined = "AU_$ctag{$tag}_$catt{$attribute}";
	    s/^/enum \{ $undefined, /o;
	    s/$/ \}/o;
	    $typeof{"$tag/$attribute"} = "$_";
	    if ($default) {
	      $initof{"$tag/$attribute"} = "A_$ctag{$tag}_$catt{$attribute}_"
					 . variablify($default);
	    }
	    else {
	      $initof{"$tag/$attribute"} = "$undefined";
	    }
	  }
	  # ...and string/token types.
	  else {
	    $typeof{"$tag/$attribute"} = 'const char*';
	    if ($default) {
	      $initof{"$tag/$attribute"} = "\"$default\"";
	    }
	    else {
	      $initof{"$tag/$attribute"} = 'NULL';
	    }

	    # Special treatment of token types.
	    if ( $type eq 'ID' or $type eq 'IDREF' ) {
	      $literaltype{"$tag/$attribute"} = '{Name}';
	      print STDERR place()."Warning: attribute type `$type' not validated.\n";
	    }
	    elsif ( $type eq 'IDREFS' ) {
	      $literaltype{"$tag/$attribute"} = '{Names}';
	      print STDERR place()."Warning: attribute type `$type' not validated.\n";
	    }
	    elsif ( $type eq 'NMTOKEN' ) {
	      $literaltype{"$tag/$attribute"} = '{Nmtoken}';
	    }
	    elsif ( $type eq 'NMTOKENS' ) {
	      $literaltype{"$tag/$attribute"} = '{Nmtokens}';
	    }
	    elsif ( $type eq 'ENTITY' ) {
	      die place()."ENTITY attribute type unimplemented.\n";
	    }
	    elsif ( $type eq 'ENTITIES' ) {
	      die place()."ENTITIES attribute type unimplemented.\n";
	    }
	    elsif ( $type ne 'CDATA' ) {
	      die place()."Unknown AttType `$type'.\n";
	    }

	  }
	}

	expandparametersat('^'); s/^\s+//; # to expand next set of declarations...
      }

      die place()."Nonsense ($_) in attribute list.\n" if $_;
    }

    $orig = '';			# in case there were no attributes...
  }

  # Parse internal parameter entity declaration.
  elsif ( m/^\s*<!ENTITY\s+%\s+($Name)\s+'([^'']*)'\s*>\s*/xo
	  or m/^\s*<!ENTITY\s+%\s+($Name)\s+"([^""]*)"\s*>\s*/xo ) {

    $_ = $'; #'
    $source{"%$1;"} = "$orig"; $orig = ''; # cycle

    my $name = $1;
    local $_ = $2;

    die "Entity `%$name;' doubly defined.\n" if $parameter{$name} or $external{$name};
    
    expandparametersat(''); s/^\s+//; s/\s+$//;
    s/\&\#([0-9]+|x[0-9a-fA-F]+);/
      (substr($1,0,1) eq 'x' ? chr(hex(substr($1,1))) : chr($1)) /ge;
    $parameter{$name} = $_;
  }

  # Parse external parameter entity declaration.
  elsif ( m/^<!ENTITY\s+%\s+($Name)\s+SYSTEM\s+'([^'']*)'\s*>\s*/xo
	  or m/^<!ENTITY\s+%\s+($Name)\s+SYSTEM\s+"([^""]*)"\s*>\s*/xo
	  or m/^<!ENTITY\s+%\s+($Name)\s+PUBLIC\s+$Literal\s+'([^'']*)'\s*>\s*/xo
	  or m/^<!ENTITY\s+%\s+($Name)\s+PUBLIC\s+$Literal\s+"([^""]*)"\s*>\s*/xo ) {

    $_ = $'; #'
    $source{"%$1;"} = "$orig"; $orig = ''; # cycle

    my $name = $1;
    die "Entity `%$name;' doubly defined.\n" if $parameter{$name} or $external{$name};
    $external{$name} = $2;
  }

  # Parse internal general entity declarations.
  elsif ( /^<!ENTITY\s+($Name)\s+['"]([^''""]*)["']\s*>\s*/xo ) {

    $_ = $'; #'
    $source{"&$1;"} = "$orig"; $orig = ''; # cycle

    my $name = $1;
    local $_ = $2;

    die "Entity `&$name;' doubly defined.\n" if $entity{$name};
    s/\&\#([0-9]+|x[0-9a-fA-F]+);/
      (substr($1,0,1) eq 'x' ? chr(hex(substr($1,1))) : chr($1)) /ge;
    $entity{$name} = cquote($_);
  }

  # Unrecognised declaration.
  else {
    die place()."Unrecognized declaration.\n";
  }

}

# Post-process DTD.

print STDOUT "Post-processing DTD.\n" if $verbose;

# Add transitive empty transitions until none can be found.
{ my $changed = 'true';
  while ($changed) {
    $changed = undef;
    for my $from (keys %emptytrans) {
      my %tos = (); for (split /,/,$emptytrans{$from}) { $tos{$_} = 'true'; }
      for my $to (keys %tos) {
	if (exists $emptytrans{$to}) {
	  for my $next (split /,/,$emptytrans{$to}) {
	    if (not $tos{$next}) {
	      $changed = $tos{$next} = 'true';
	    }
	  }
	}
      }
      $emptytrans{$from} = join ',',keys %tos;
    }
  }
}

# Complete all states with their `empty' equivalents.
for my $tag (@tags) {

  # Complete and prune instates and endstates...
  my %ins = ();
  if (exists $instates{$tag}) {
    for (split ',',$instates{$tag}) { $ins{$_} = 'true'; }
  }
  my %ens = ();
  if (exists $endstates{$tag}) {
    for (split ',',$endstates{$tag}) { $ens{$_} = 'true'; }
  }

  # ...and exit transitions...
  my %exits = ();
  if (exists $exittrans{$tag}) {
    for (split ',',$exittrans{$tag}) { m/^($Name)=>($Name)$/o; $exits{$1} = $2; }
  }

  # Encode ANY as Mixed contents with all tags permitted.
  for (keys %any) { $ins{"IN_$_"} = 'true'; }

  # For each empty transition A->B add A where B occurs.
  for my $from (keys %emptytrans) {
    for my $to (split /,/,$emptytrans{$from}) {
      $ins{$from} = 'true' if $ins{$to};
      $ens{$from} = 'true' if $ens{$to};
      $exits{$from} = $exits{$to} if $exits{$to};
    }
  }

  # Completion done...now store'em right back.
  $instates{$tag} = join ',', keys %ins if %ins;
  $endstates{$tag} = join ',', keys %ens if %ens;
  $exittrans{$tag} = join ',', map "$_=>$exits{$_}", keys %exits if %exits;
}

if (not %roottags) {
  for (@tags) { $roottags{$_} = 'true'; }
}


# Debugging: show DTD representation.

if ($debug) {

  print STDOUT "DTD debug output:\n" if $verbose;

  sub printhash {
    my ($name) = @_;

    my ($k,$v);
    my $out = '';

    while ( ($k, $v) = each(%$name) )
    {
      $out .= "\n    $k => " . ($v || "undef"); 
  }

    return $out;
  }

  print '%opt = (' . printhash(\%opt) . ")\n";

  print '%source = (' . printhash(\%source) . ")\n";
  print "\n";

  print '@tags = (' . join(',',@tags) . ")\n";
  print '%ctag = (' . printhash(\%ctag) . ")\n";

  print '%states = (' . printhash(\%states) . ")\n";
  print '%instates = (' . printhash(\%instates) . ")\n";
  print '%endstates = (' . printhash(\%endstates) . ")\n";

  print '%emptytrans = (' . printhash(\%emptytrans) . ")\n";
  print '%exittrans = (' . printhash(\%exittrans) . ")\n";
  print '%roottags = (' . printhash(\%roottags) . ")\n";

  print '%empty = (' . printhash(\%empty) . ")\n";
  print '%properempty = (' . printhash(\%properempty) . ")\n";
  print '%any = (' . printhash(\%any) . ")\n";
  print '%mixed = (' . printhash(\%mixed) . ")\n";
  print '%children = (' . printhash(\%children) . ")\n";
  print '%inmixed = (' . printhash(\%inmixed) . ")\n";
  print "\n";

  print '@attributes = (' . join(',',@attributes) . ")\n";
  print '%catt = (' . printhash(\%catt) . ")\n";

  print '%atttype = (' . printhash(\%atttype) . ")\n";
  print '%enumtype = (' . printhash(\%enumtype) . ")\n";
  print '%literaltype = (' . printhash(\%literaltype) . ")\n";
  print '%typeof = (' . printhash(\%typeof) . ")\n";

  print '%attdef = (' . printhash(\%attdef) . ")\n";
  print '%required = (' . printhash(\%required) . ")\n";
  print '%fixed = (' . printhash(\%fixed) . ")\n";
  print '%initof = (' . printhash(\%initof) . ")\n";

  print '%attlist = (' . printhash(\%attlist) . ")\n";
  print '%withattr = (' . printhash(\%withattr) . ")\n";
  print "\n";

  print '%entity = (' . printhash(\%entity) . ")\n";
  print '%parameter = (' . printhash(\%parameter) . ")\n";
  print '%external = (' . printhash(\%external) . ")\n";
}


# WRITE API HEADER (if requested).


if ($opt{H}) {

  print STDOUT "Generating XML processor header in `$HEADER'.\n" if $verbose;

  open HEADER, "+>$HEADER" || die "$0: cannot write $HEADER: $!\n";
  select HEADER;

  # Identification and license.
  print "/* XML processor/application API for $dtd"
      . ($dtdrevision ? " ($dtdrevision)" : "") . ".\n";
  print " * Generated " . `date +'%Y/%m/%d %T.'`;
  print " *\n";
  redistribute(" *");
  print " */\n";
  print "\n";

  # Output the declarations safeguarded againts repeated loading.
  print "#ifndef _FLEXML_${cdtd}_H\n";
  print "#define _FLEXML_${cdtd}_H\n";
  print "\n";
  api_functions('extern ',';');
  print "\n";
  api_types();
  print "\n";
  api_data('extern ');
  print "\n";

  print "/* XML application utilities. */\n";
  print "extern int element_context(int);\n";
  print "\n";
  print "/* XML processor entry point. */\n";
  print "extern int yylex(void);\n";
  print "\n";
  
  print "/* Flexml error handling function (useful only when -q flag passed to flexml) */\n";
  print "const char * parse_err_msg(void);\n";
    
  print "#endif\n";

  close HEADER || die "$0: cannot read $HEADER: $!\n"; 
}


# WRITE XML PROCESSOR (if requested).


if ($opt{S} or $opt{A}) {

  print STDOUT "Writing XML processor"
             . ($opt{a} || $opt{A} ? " and application" : "")
             . " onto `$SCANNER'.\n" if $verbose;

  open SCANNER, "+>$SCANNER"|| die "$0: cannot write $SCANNER: $!\n";
  select SCANNER;

  open (SKELETON) || die "$0: cannot read $SKELETON: $!\n";

  # Identification and license.
  print "/* Validating XML processor for $dtd"
      . ($dtdrevision ? " ($dtdrevision)" : "") . ".\n";
  print " * Generated " . `date +'%Y/%m/%d %T.'`;
  print " *\n";
  redistribute(" *");
  print " */\n";
  print "\n";

  # Skip initial comment.
  while (<SKELETON>) { last if m/^\%\{/; }
  print;

  # Copy body of skeleton scanner with substitutions...
  while (<SKELETON>) {

    if ( /^FLEXML_VERSION$/ ) {

      print "const char rcs_flexml[] =\n"
	  . " \"\$\" \"$Id \$\";\n";

      print "const char rcs_${cdtd}_dtd[] =\n"
	  . " \"\$\" \"$dtdrevision \$\";\n" if $dtdrevision;

    }
    elsif ( /^FLEXML_DEFINITIONS$/ ) {

      print "#define DEBUG\n" if $debug;
      print "#define FLEXML_yylineno\n" if $lineno;
      print "#define FLEXML_NOFAIL\n" if $nofail;
      print "#define FLEXML_quiet_parser\n" if $quiet_parser;
      print "#define FLEXML_HasMixed\n" if %inmixed;
      print "#define FLEXML_BUFFERSTACKSIZE $stacksize\n";
      print "\n";

      if ($opt{A}) {
	api_functions('static ',';');
	print "\n";
	api_types();
	print "\n";
	api_data('static ');
      }
      else {
	print "/* XML processor api. */\n";
	print "#include \"$HEADER\"\n\n"
	    if ($opt{H});
	api_data('');
      }

    }
    elsif ( /^FLEXML_FLEX_OPTIONS$/ ) {

      print "%option yylineno\n" if $lineno;
      print "%option debug\n" if $debug;
      print "%option nounput\n" if not %entity;

    }
    elsif ( /^FLEXML_START_CONDITIONS$/ ) {

      for (@tags) {
	my $c = $ctag{$_};
	print "%x"
            . ($roottags{$_} ? " ROOT_$c" : "")
            . " AL_$c " . join(' ',split(',',$states{$_})) . "\n";
      }

    }
    elsif ( /^FLEXML_EXTRA_DEFINITIONS$/ ) {

     print "%{\n";
     print "/* State names. */\n";
     print "const char* *statenames=NULL;\n";
     print "%}\n";

    }
    elsif ( /^FLEXML_EXTRA_DEFINITIONS_INIT$/ ) {

      my ($state, $tag);

      print "  /* FleXML_init */\n";
      print "  next = bufferstack;\n";
      print "  if(!statenames) {statenames= (const char **)calloc(IMPOSSIBLE,sizeof(char*));\n";
      for ('PROLOG','DOCTYPE','EPILOG','INCOMMENT','INPI','VALUE1','VALUE2','CDATA') {
	print "  statenames[$_] = NULL;\n";
      }
      for my $tag (@tags) {
	my $c = $ctag{$tag};
	print "  statenames[ROOT_$c] = NULL;\n" if $roottags{$tag};
	print "  statenames[AL_$c] = NULL;\n";
	for (split ',',$states{$tag}) {
	  print "  statenames[$_] = \"$tag\";\n";
	}
      }
      print "  }\n";
    }
    elsif ( /^FLEXML_DOCTYPES$/ ) {

      my $sysid = ( $uri ? "(\"'$uri'\"|\"\\\"$uri\\\"\")"
			    : "(\"'$dtd'\"|\"\\\"$dtd\\\"\")" );

      for (keys %roottags) {
	my $c = $ctag{$_};
	print " \"<!DOCTYPE\"{S}\"$_\"{S}"
	    . "SYSTEM{S}" . $sysid . "{s}\">\" SET(ROOT_$c);\n";
	if ($pubid) {
	  print " \"<!DOCTYPE\"{S}\"$_\"{S}"
	      . "PUBLIC{S}(\"'$pubid'\"|\"\\\"$pubid\\\"\"){S}"
	      . $sysid . "{s}\">\" SET(ROOT_$c);\n";
	}
      }	  

    }
    elsif ( /^FLEXML_RULES$/ ) {

      # Dump all parameter entity declarations.
      for (keys %parameter) { printsource($_); }

      # Dump all start and empty tag recognition rules.
      for my $tag (@tags) {

	my $myctag = $ctag{$tag};
	my @myattributes = (exists $attlist{$tag} ? split /,/,"$attlist{$tag}" : ());
	my ($intag, $attribute);

	# Tag's source element and attribute declarations.
	printsource($tag);
	for my $attribute (@myattributes) {
	  printsource("$tag/$attribute");
	}

	# Build element exit transition command.
	my $exitswitch = "";
	if (exists $exittrans{$tag}) {
	  $exitswitch .= "  switch (YY_START) {\n";
	  my %casesto = ();
	  for (split /,/,$exittrans{$tag}) {
	    if (m/^($Name)=>($Name)$/o) {
	      $casesto{$2} .= "case $1: ";
	    }
	  }
	  for (keys %casesto) {
	    $exitswitch .= "   $casesto{$_}SET($_); break;\n" 
	  }
	  $exitswitch .= "  }\n";
	}
	

	# Start or empty tag: initialise attribute list.
	print "\n";
	if ($roottags{$tag}) {
	  print "<ROOT_$myctag" . ($instates{$tag} ? ",$instates{$tag}" : "");
	}
	else {
	  print "<$instates{$tag}";
	}
# SHACK
#	print ">{ \n" .
#              "\"<$tag\"{s} {\n";
	print ">\"<$tag\"{s} {\n";

	for my $attribute (@myattributes) {
	  print "  A_${myctag}_$catt{$attribute} = " . $initof{"$tag/$attribute"} . ";\n";
	}
	print "  ENTER(AL_$myctag);\n";
	print "  }\n";

#	print " .       FAIL(\"Unexpected character `%c': `<$tag' expected.\",yytext[0]);\n";
#	print " <<EOF>> FAIL(\"Premature EOF: `<$tag' expected.\");\n";
#	  #unless $mixed{$tag} or $nofail;
#	print "}\n";

	# Attribute list (of start or empty tag):
	print "\n";
	print "<AL_$myctag>{\n";

	for my $attribute (@myattributes) {
	  my $type; # set by conditions
	  
	  if ($type = $enumtype{"$tag/$attribute"}) {

	    # - fixed enumeration attribute: generate one rule,
	    if ($fixed{"$tag/$attribute"}) {
		print " \"$attribute\"{Eq}\"'" . $attdef{"$tag/$attribute"} . "'\""
		    . " |\n"
		    . " \"$attribute\"{Eq}\"\\\"" . $attdef{"$tag/$attribute"} . "\\\"\""
		    . " A_${myctag}_$catt{$attribute}"
		    . " = " . $initof{"$tag/$attribute"} . ";\n";
	    }
	    else {
	      # - (non-fixed) enumeration attribute: generate a rule per value,
	      for my $alternative (split /,/,$type) {
		print " \"$attribute\"{Eq}\"'$alternative'\""
		    . " |\n"
		    . " \"$attribute\"{Eq}\"\\\"$alternative\\\"\""
		    . " A_${myctag}_$catt{$attribute}"
		    . " = A_${myctag}_$catt{$attribute}_" . variablify($alternative) . ";\n";
	      }
	    }
	  }
	  elsif ($fixed{"$tag/$attribute"}) {

	    # - fixed (non-enumeration) attribute: generate one rule per literal form,

	    print " \"$attribute\"{Eq}\"'" . $attdef{"$tag/$attribute"} . "'\""
	        . " |\n"
		. " \"$attribute\"{Eq}\"\\\"" . $attdef{"$tag/$attribute"} . "\\\"\""
		. " A_${myctag}_$catt{$attribute}"
		. " = " . $initof{"$tag/$attribute"} . ";\n";

	  }
	  elsif ($type = $literaltype{"$tag/$attribute"}) {

	    # - (non-fixed) literal-type attribute: scan literal string directly, or
	    print " \"$attribute\"{Eq}\'$type\' BUFFERLITERAL('\\\'',A_${myctag}_$catt{$attribute});\n";

	    print " \"$attribute\"{Eq}\\\"$type\\\" BUFFERLITERAL('\"',A_${myctag}_$catt{$attribute});\n";

	  }
	  else {

	    # - (non-fixed non-literal) attribute: scan string with entity expansion.
	    print " \"$attribute\"{Eq}\\' ENTER(VALUE1); BUFFERSET(A_${myctag}_$catt{$attribute});\n";
	    print " \"$attribute\"{Eq}\\\" ENTER(VALUE2); BUFFERSET(A_${myctag}_$catt{$attribute});\n";

	  }
	  print "\n";
	}
	#
	# - the end of a start tag means to enter the contents after
	#   checking that all required attributes were set.
	print " \">\" {\n";
	for my $attribute (@myattributes) {
	  if ($required{"$tag/$attribute"}) {
	    print "  if (!A_$ctag{$tag}_" . variablify($attribute) . ")"
	        . " FAIL(\"Required attribute `$attribute' not set for `$tag' element.\");\n";
	  }
	}
	print "  LEAVE; STag_$myctag();"
	    . (%inmixed ? ' pushbuffer(pcdata);' : '')
	    . ($mixed{$tag} ? 'BUFFERSET(pcdata)' : 'pcdata = NULL'). ";"
	    . " ENTER($startstate{$tag});\n";
	print " }\n";
	#
	# - accept and handle empty tags straight away,
	if ($empty{$tag}) {
	  print " \"/>\" {\n";
	  for my $attribute (@myattributes) {
	    if ($required{"$tag/$attribute"}) {
	      print "  if (!A_$ctag{$tag}_" . variablify($attribute) . ")"
		  . " FAIL(\"Required attribute `$attribute' not set for `$tag' element.\");\n";
	    }
	  }
	  print "  LEAVE; STag_$myctag();"
	      . (%inmixed ? ' pushbuffer(pcdata);' : '')
	      . ' pcdata = ' . ($mixed{$tag} ? '""' : 'NULL') . ';'
	      . " ETag_$myctag();"
	      . (%inmixed ? ' pcdata = popbuffer();' : '')
	      . "\n";
	  #
	  print $exitswitch;
	  print " }\n";
	}
	elsif (not $nofail) {
	  print " \"/>\" FAIL(\"`$tag' element cannot be empty.\");\n";
	}
	#
	# - spaces are skipped, and
	print " .       FAIL(\"Unexpected character \`%c\' in attribute list of $tag element.\", yytext[0]);\n" unless $nofail;
	#
	# - other stuff is an error.
	print " {Name} FAIL(\"Bad attribute `%s' in `$tag' element start tag.\",yytext);\n" unless $nofail;
	print " <<EOF>> FAIL(\"EOF in attribute list of `$tag' element.\");\n" unless $nofail;
	print "}\n";

	# End tag.
	print "\n";
	print "<$endstates{$tag}>{\n";
	print " \"</$tag\"{s}\">\" {\n";
	print "  LEAVE;\n";
	print "  BUFFERDONE;\n" if $mixed{$tag};
	print "  ETag_$myctag();\n";
	print "  pcdata = popbuffer();\n" if %inmixed;
	print $exitswitch;
	print " }\n";

	# Errors when expecting end tag.
	print " \"</\"{Name}{s}\">\" FAIL(\"Unexpected end-tag `%s': `</$tag>' expected.\",yytext);\n"
	  unless $nofail;
	print " .       FAIL(\"Unexpected character `%c': `</$tag>' expected.\",yytext[0]);\n"
	  unless $mixed{$tag} or $nofail;
	print " <<EOF>> FAIL(\"Premature EOF: `</$tag>' expected.\");\n" unless $nofail;
	print "}\n";

	# Errors when expecting root tag.
	if ($roottags{$tag} and $nofail) {
	  print "\n";
	  print "<ROOT_$myctag>{\n";
	  print " .       FAIL(\"Unexpected character `%c': `$tag' element expected.\",yytext[0]);\n";
	  print " <<EOF>> FAIL(\"EOF in prolog.\");\n";
	  print "}\n";
	}
      }

    }
    elsif ( /FLEXML_MIXED([,>])/ ) {

      if (%mixed) {
	print  "$`" . join(',', map("IN_$ctag{$_}", keys %mixed)) . "$1$'";
      }
      else {
	print "$`IMPOSSIBLE$1$'";
      }

    }
    elsif ( /FLEXML_NON_MIXED([,>])/ ) {

      my $sep = $`;
      for (@tags) {
	print $sep . ($roottags{$_} ? "ROOT_$ctag{$_}," : "")
            . "AL_$ctag{$_}";
	print ",$states{$_}" if $properempty{$_} or $children{$_};
	$sep = ',';
      }
      print "$1$'";

    }
    elsif ( /FLEXML_COMMENTS([,>])/ ) {

      print "$`"
	  . join(',', map(($roottags{$_} ? "ROOT_$ctag{$_}," : "")
			  . "AL_$ctag{$_},$states{$_}", @tags))
	  . "$1$'";

    }
    elsif ( /^FLEXML_ENTITIES$/ ) {

      # Process general entities.
      for my $ent (keys %entity) {
	printsource("%$ent;");
	print " \"&$ent;\" ENTITYTEXT(\"" . $entity{$ent} . "\");\n";
      }

      print " /* Non-defined standard entities... */\n";
      print "\"&amp;\"  BUFFERPUTC('&');\n" unless $entity{"amp"};
      print "\"&lt;\"   BUFFERPUTC('<');\n" unless $entity{"lt"};
      print "\"&gt;\"   BUFFERPUTC('>');\n" unless $entity{"gt"};
      print "\"&apos;\" BUFFERPUTC('\\\'');\n" unless $entity{"apos"};
      print "\"&quot;\" BUFFERPUTC('\"');\n" unless $entity{"quot"};

    }
    elsif ( /^FLEXML_FINAL$/ and not $nofail ) {

      # Catch-all error cases.
      for my $tag (@tags) {
	 for (split ',',$states{$tag}) {
	   print "<$_>{\n";
	   print " .    FAIL(\"Unrecognized `%c' in $_.\",yytext[0]);\n";
	   print " [\\n] FAIL(\"Unrecognized newline in $_.\");\n";
	   print "}\n";
	 }
      }
      for ('PROLOG','DOCTYPE','EPILOG','INCOMMENT','INPI','VALUE1','VALUE2','CDATA','INITIAL','IMPOSSIBLE') {
	 print "<$_>{\n";
	 print " .   FAIL(\"Unrecognized `%c' in $_.\",yytext[0]);\n";
	 print " [\\n] FAIL(\"Unrecognized space in $_.\");\n";
	 print "}\n";
      }

    }
    elsif ( $nofail and /FAIL\(/ ) {
      #ignore
    }
    else {
      s/"\$Id/"\$" "Id/;
      print;
    }

  }

  close SKELETON || die "$0: Cannot close $SKELETON: $!\n";
  unless ($opt{A}) {
    close SCANNER || die "$0: Cannot close $SCANNER: $!\n";
  }
}


# WRITE APPLICATION.


if ($opt{D}) {

  print STDOUT "Writing XML"
             . ($opt{a} ? "" : " dummy")
             . " application onto `$APPLICATION'.\n" if $verbose;

  open APPLICATION, "+>$APPLICATION" || die "$0: Cannot write $APPLICATION: $!\n";
  select APPLICATION;

  # Identification and license.
  print "/* XML application for $dtd"
      . ($dtdrevision ? " ($dtdrevision)" : "") . ".\n";
  print " * Includes actions from $ACTIONS.\n" if $ACTIONS;
  print " * Generated " . `date +'%Y/%m/%d %T.'`;
  print " *\n";
  redistribute(" *");
  print " */\n";
  print "\n";

  # Declarations.
  print "#include \"$HEADER\"\n";
  print "\n";

}

if ($opt{D} or $opt{A}) {

  # Get requested actions.
  if ($ACTIONS) {

    open ACTIONS, "./flexml-act $ACTIONS|" || die "$0: Cannot exec ./flexml-act $ACTIONS: $!\n";

    my ($tag,$attribute);
    my @myattributes;
    my $lineno = 0;
    my $isstart = undef;

    while (<ACTIONS>) {
      ++$lineno;

      if ( m/^\#line ([0-9]+)/ ) {
	$lineno = $1;
      }
      elsif ( m/^void\s+STag_($Name)\(void\)$/xo ) {
	$tag = $1;
	die "\"$ACTIONS\", line $lineno: Unknown element `$tag'.\n" unless $ctag{$tag};
	$startok{$tag} = 'true';
	@myattributes = (exists $withattr{$tag} ? split /,/,"$withattr{$tag}" : ());
	$isstart = 'true';
      }
      elsif ( m|^\}\s+\/\*\s+STag_($Name)\s+\*\/$|xo ) {
	$tag = undef;
	@myattributes = ();
	$isstart = 'true';
      }
      elsif ( m/^void\s+ETag_($Name)\(void\)$/xo ) {
	$tag = $1;
	$endok{$1} = 'true';
	$isstart = undef;
      }

      # Make function names C-friendly (idempotently!)
      s/(\s+[SE])Tag_($Name)\(/$1Tag_$ctag{$2}\(/xg;

      # Replace special annotations with C equivalents.
      if ($tag) {

	while ( s/\{($Name)\}/A_$ctag{$tag}_$catt{$1}/x ) {
	  die "\"$ACTIONS\", line $lineno: Attributes only allowed in start tag.\n"
	    if not $isstart;
	  die "\"$ACTIONS\", line $lineno: Unknown attribute `$1' for <$tag>.\n"
	    if not $atttype{"$tag/$1"};
	}

	while ( s/\{[!]($Name)\}/AU_$ctag{$tag}_$catt{$1}/x ) {
	  die "\"$ACTIONS\", line $lineno: Attributes only allowed in start tag.\n"
	    if not $isstart;
	  die "\"$ACTIONS\", line $lineno: Unknown attribute `$1' for <$tag>.\n"
	    if not $atttype{"$tag/$1"};
	}

	while ( s|\{($Name)=($Name)\}|
		  "A_$ctag{$tag}_$catt{$1}_" . variablify($2); |xe ) {
	  die "\"$ACTIONS\", line $lineno: Attributes only allowed in start tag.\n"
	    if not $isstart;
	  my ($att,$elt) = ($1,$2);
	  die "\"$ACTIONS\", line $lineno: Unknown attribute $1 for <$tag>.\n"
	    if not $atttype{"$tag/$1"};
	  die "\"$ACTIONS\", line $lineno: Attribute $att does not have value $elt for <$tag>.\n"
	    if not $enumtype{"$tag/$att"} =~ m/\b$elt\b/ ;
	}

	while ( s|\{\#(PCDATA)\}|pcdata| ) {
	  die "\"$ACTIONS\", line $lineno: {#PCDATA} only allowed in end tag.\n"
	    if $isstart;
	  die "\"$ACTIONS\", line $lineno: {#PCDATA} only permitted in end tag with Mixed contents.\n"
	    if not $mixed{$tag};
	}

	die "\"$ACTIONS\", line $lineno: Malformed annotation `$&' in <$tag> action.\n"
	  if m|\{[^;\s]+\}|o;
      }
      print $_;

    }
    close ACTIONS || die "$0: Cannot close pipe to flexml-act: $!\n";
    print "\n";
  }

  # Fill up with dummy declarations for the remaining functions.
  api_functions('',' {}');

}

if ($opt{D}) {
  close APPLICATION || die "$0: Cannot close $APPLICATION: $!\n";
}
elsif ($opt{A}) {
  close SCANNER || die "$0: Cannot close $SCANNER: $!\n";
}


=pod

=head1 NAME

flexml - generate validating XML processor and applications from DTD

=head1 SYNOPSIS

B<flexml>
[B<-ASHDvdnLXV>]
[B<-s>I<skel>]
[B<-p>I<pubid>]
[B<-u>I<uri>]
[B<-r>I<rootags>]
[B<-a>I<actions>]
I<name>[F<.dtd>]

=head1 DESCRIPTION

I<Flexml> reads I<name>F<.dtd> which must be a DTD (Document Type
Definition) describing the format of XML (Extensible Markup Language)
documents, and produces a "validating" XML I<processor> with an
interface to support XML I<application>s.  Proper applications can be
generated optionally from special "action files", either for linking
or textual combination with the processor.

The generated processor will only validate documents that conform
strictly to the DTD, I<without extending it>, more precisely we in
practice restrict XML rule [28] to

  [28r] doctypedecl ::= '<!DOCTYPE' S Name S ExternalID S? '>'

where the C<ExternalId> denotes the used DTD.  (One might say, in
fact, that I<flexml> implements "non-extensible" markup. :)

The generated processor is a I<flex>(1) scanner, by default named
I<name>F<.l> with a corresponding C header file I<name>F<.h> for
separate compilation of generated applications.  Optionally I<flexml>
takes an I<actions> file with per-element actions and produces a C
file with element functions for an XML application with entry points
called from the XML processor (it can also fold the XML application
into the XML processor to make stand-alone XML applications but this
prevents sharing of the processor between applications).

In L</OPTIONS>Z<> we list the possible options, in L</ACTION FILE
FORMAT>Z<> we explain how to write applications, in L</COMPILATION> we
explain how to compile produced processors and applications into
executables, and in L</BUGS> we list the current limitations of the
system before giving standard references.

=head1 OPTIONS

I<Flexml> takes the following options.

=over 4

=item B<-A>

Generate a I<stand-alone> scanner application.  If combined with
B<-a>I<actions> then the application will be named as I<actions> with
the extension replaced by F<.l>, otherwise it will be in I<name>F<.l>.
Conflicts with B<-S>, B<-H>, and B<-D>.

=item B<-a> I<actions>

Uses the I<actions> file to produce an XML application in the file
with the same name as I<actions> after replacing the extension with
F<.c>.  If combined with B<-A> then instead the stand-alone
application will include the action functions.

=item B<-D>

Generate a dummy application I<name>F<-dummy.c> with just empty
functions to be called by the XML processor.  If combined with
B<-a>I<actions> then the application will insert the specified
actions and be named as I<actions> with the extension replaced by
F<.c>.  Conflicts with B<-A>; implied by B<-a> unless either of
B<-SHD> is specified.

=item B<-d>

Turns on debug mode in the flex scanner and also prints out the
details of the DTD analysis performed by I<flexml>.

=item B<-H>

Generate the header file I<name>F<.h>.  Conflicts with B<-A>; on by
default if none of B<-SHD> specified.

=item B<-L>

Makes the XML processor (as produced by I<flex>(1)) count the lines in
the input and keep it available to XML application actions in the
integer C<yylineno>.  (This is off by default as the performance
overhead is significant.)

=item B<-q>

Prevents the XML processor (as produced by I<flex>(1)) from reporting 
the error it runs into on stderr. Instead, users will have to pool for
error messages with the parse_err_msg() function.
By default, error messages are written on stderr.

=item B<-n>

"Dry-run": do not produce any of the output files.

=item B<-p> I<pubid>

Sets the document type to be C<PUBLIC> with the identifier I<pubid>
instead of C<SYSTEM>, the default.

=item B<-r> I<roottags>

Restricts the XML processor to validate only documents with one of the
root elements listed in the comma-separated I<roottags>.

=item B<-S>

Generate the scanner I<name>F<.l>.  Conflicts with B<-A>; on by
default if none of B<-SHD> specified.

=item B<-s> I<skel>

Use the skeleton scanner I<skel> instead of the default.

=item B<-b> I<stack_size>

Sets the FLEXML_BUFFERSTACKSIZE to stack_size (100000 by default). Use
this option when you get an error like "Assertion `next<limit'
failed".

=item B<-u> I<uri>

Sets the URI of the DTD, used in the C<DOCTYPE> header, to the
specified I<uri> (the default is the DTD name).

=item B<-v>

Be verbose: echo each DTD declaration (after parameter expansion).

=item B<-V>

Print the version of I<flexml> and exit.

=back

=head1 ACTION FILE FORMAT

Action files, passed to the B<-a> option, are XML documents conforming
to the DTD F<flexml-act.dtd> which is the following:

  <!ELEMENT actions ((top|start|end)*,main?)>
  <!ENTITY % C-code "(#PCDATA)">
  <!ELEMENT top   %C-code;>
  <!ELEMENT start %C-code;>  <!ATTLIST start tag NMTOKEN #REQUIRED>
  <!ELEMENT end   %C-code;>  <!ATTLIST end   tag NMTOKEN #REQUIRED>
  <!ELEMENT main  %C-code;>

The elements should be used as follows:

=over 4

=item C<top>

Use for top-level C code such as global declarations, utility
functions, etc.

=item C<start>

Attaches the code as an action to the element with the name of the
required "C<tag>" attribute.  The "C<%C-code;>" component should be C
code suitable for inclusion in a C block (i.e., within C<{>...C<}> so
it may contain local variables); furthermore the following extensions
are available:

C<{>I<attribute>C<}>: Can be used to access the value of the
I<attribute> as set with I<attribute>C<=>I<value> in the start tag.
In C, C<{>I<attribute>C<}> will be interpreted depending on the
declaration of the attribute. If the attribute is declared as an
enumerated type like

  <!ATTLIST attrib (alt1 | alt2 |...) ...>

then the C attribute value is of an enumerated type with the elements
written C<{>I<attribute>C<=>I<alt1>C<}>,
C<{>I<attribute>C<=>I<alt2>C<}>, etc.; furthermore an I<unset>
attribute has the "value" C<{!>I<attribute>C<}>.  If the attribute is
not an enumeration then C<{>I<attribute>C<}> is a null-terminated C
string (of type C<char*>) and C<{!>I<attribute>C<}> is C<NULL>.

=item C<end>

Similarly attaches the code as an action to the end tag with the name
of the required "C<tag>" attribute; also here the "C<%C-code;>"
component should be C code suitable for inclusion in a C block.  In
case the element has "Mixed" contents, i.e, was declared to permit
C<#PCDATA>, then the following variable is available:

C<{#PCDATA}>: Contains the text (C<#PCDATA>) of the element as a
null-terminated C string (of type C<char*>).  In case the Mixed
contents element actually mixed text and child elements then C<pcdata>
contains the plain concatenation of the text fragments as one string.

=item C<main>

Finally, an optional "C<main>" element can contain the C C<main>
function of the XML application.  Normally the C<main> function should
include (at least) one call of the XML processor:

C<yylex()>:
Invokes the XML processor produced by I<flex>(1) on the XML document
found on the standard input (actually the C<yyin> file handle: see the
manual for I<flex>(1) for information on how to change this as well as
the name C<yylex>).

If no C<main> action is provided then the following is used:

  int main() { exit(yylex()); }

=back

It is advisable to use XML E<lt>C<![CDATA[> ... C<]]>E<gt> sections
for the C code to make sure that all characters are properly passed to
the output file.

Finally note that I<Flexml> handles empty elements
E<lt>I<tag>C</>E<gt> as equivalent to
E<lt>I<tag>E<gt>E<lt>C</>I<tag>E<gt>.

=head1 COMPILATION

The following I<make>(1) file fragment shows how one can compile
I<flexml>-generated programs:


  # Programs.
  FLEXML = flexml -v

  # Generate linkable XML processor with header for application.
  %.l %.h: %.dtd
	  $(FLEXML) $<

  # Generate C source from flex scanner.
  %.c: 	  %.l
	  $(FLEX) -Bs -o"$@" "$<"

  # Generate XML application C source to link with processor.
  # Note: The dependency must be of the form "appl.c: appl.act proc.dtd".
  %.c:	  %.act
	  $(FLEXML) -D -a $^

  # Direct generation of stand-alone XML processor+application.
  # Note: The dependency must be of the form "appl.l: appl.act proc.dtd".
  %.l:	  %.act
	  $(FLEXML) -A -a $^

=head1 BUGS

The present version of I<flexml> is to be considered in "early beta"
state thus bugs should be expected (and the author would like to hear
about them).  Here are some known restrictions that we hope to
overcome in the future:

=over 4

=item *

The character set is merely ASCII (actually I<flex>(1) handles 8 bit
characters but only the ASCII character set is common with the XML
default UTF-8 encoding).

=item *

C<ID> type attributes are not validated for uniqueness; C<IDREF> and
C<IDREFS> attributes are not validated for existence.

=item *

The C<ENTITY> and C<ENTITIES> attribute types are not supported.

=item *

C<NOTATION> declarations are not supported.

=item *

The various C<xml:>-attributes are treated like any other attributes;
in particular C<xml:spaces> should be supported.

=item *

The XML processor currently uses a fixed-size buffer to read
C<pcdata>.  It should not.

=item *

The DTD parser is presently a perl hack so it may parse some DTDs
badly; in particular the expansion of parameter entities may not
conform fully to the XML specification.

=item *

A child should be able to "return" a value for the parent (also called
a I<synthesised attribute>).  Similarly an element in Mixed contents
should be able to inject text into the C<pcdata> of the parent.

=back

=head1 FILES

=over 4

=item F<./skel>

The skeleton scanner with the generic parts of XML scanning.

=item F</usr/share/doc/flexml/>

License, further documentation, and examples.

=back

=head1 SEE ALSO

I<flex>(1), Extensible Markup Language (XML) 1.0 (W3C Recommendation
REC-xml-1998-0210).

=head1 AUTHOR

I<Flexml> was written by Kristoffer Høgsbro Rose,
E<lt>C<krisrose@debian.org>E<gt>.

=head1 COPYRIGHT

The program is Copyright (c) 1999 Kristoffer Rose (all rights
reserved) and distributed under the GNU General Public License (GPL,
also known as "copyleft", which clarifies that the author provides
absolutely no warranty for I<flexml> and ensures that I<flexml> is and
will remain available for all uses, even comercial).

=head1 ACKNOWLEDGEMENT

I am grateful to NTSys (France) for supporting the development of
I<flexml>.  Finally extend my severe thanks to Jef Poskanzer, Vern
Paxson, and the rest of the I<flex> maintainers and GNU developers for
a great tool.

=cut