File: sourcemap.sml

package info (click to toggle)
smlnj 110.79-8
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, sid
  • size: 82,564 kB
  • sloc: ansic: 32,532; asm: 6,314; sh: 2,296; makefile: 1,821; perl: 1,170; pascal: 295; yacc: 190; cs: 78; python: 77; lisp: 19
file content (268 lines) | stat: -rw-r--r-- 11,979 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
(* sourcemap.sml
 *
 * COPYRIGHT (c) 2012 The Fellowship of SML/NJ (http://www.smlnj.org)
 * All rights reserved.
 *)

(* Norman Ramsey: *)
(* I can imagine at least three implementations: one that doesn't
 * support resynchronization, one that supports resynchronization only at
 * column 1, and one that supports arbitrary resynchronization.
 *
 * This implementation supports arbitary resynchronization.
 *
 * Changed ErrorMsg to use SourceMap to get source locations; only the
 * formatting is done internally.
 *
 * Added SourceMap structure.
 *)

(* DBM: what is "resynchronization" and what is it used for?  Is there any
 * reason to continue to support it (and maintain the extra code complexity)?
 * If this was a feature used only by Ramsey's noweb utility, which is defunct,
 * then we could simplify the sourcemap code.  -- John claims that resynchronization
 * is still relevant (examples?). *)

(* DBM: "Resynchonization" supports a model where the input stream for a compilation
 * unit is made up of multiple source files.  These may be combined either by
 * concatenation or inserting one file in the middle of another (like #include
 * in cpp).  We'll call the pieces of source that are being combined "file segments"
 * or "segments" for short.
 *
 * We'll assume that minimal granularity for resynchonization is a 
 * source line (i.e. no switching files in the middle of a line).
 * The boundaries between source file segments will be marked by #line
 * commands embedded in comments.  These have the form '(*#line nn "filename"*)'
 * or '(*#line nn*)' where nn is a new line number at which the next segment
 * starts, and filename is the name of the file for the next segment. It is
 * assumed that the #line command comment will appear on a line by itself
 * (presumably inserted by an external preprocessor that is responsible for
 * combining source segments to form the input stream).  These #line commands
 * are recognized and interpreted by the lexer, which calls resynch to change
 * the state of the current sourcemap.
 *
 * QUESTION: Can a source region, which designates a contiguous region in the
 * input stream, cross one or more file segments?  Or should all regions created
 * during parsing and used in elaboration be within a single segment (and therefore
 * be associated with a single source file)? The type and implementation of
 * function fileregion imply that a region can span multiple segments.
 * 
 * QUESTION: Presumably, the original motivation for adding this feature was to support
 * Norman Ramsey's nw "literate programming" system, which we no longer support.
 * What new clients use this functionality?
 *
 * Obviously, the implementation of sourcemap could be made much simpler without
 * resynchronization.
 *
 * New functionality for mapping regions to source strings (for enhanced type error
 * messages) is currently incompatible with resynchronization.
 *
 *) 

structure SourceMap :> SOURCE_MAP =
struct

  (* compiler bug errors *)
  exception SourceMap of string
  fun bug msg = raise SourceMap(msg)

  (* types ------------------------------- *)

  (* A character position is an integer.  A region is delimited by the
   * position of the start character and one beyond the end.
   * It might help to think of Icon-style positions, which fall between
   * characters.
   *)

  type charpos = int
    (* charpos is 1-based. I.e. the (default) position of the first character in the
     * input stream is 1 (????) *)

  type region = charpos * charpos
    (* INVARIANT: (lo,hi) : region ==> lo <= hi 
     * If region /= (0,0), then lo < hi, i.e. all non-null regions are nonempty. *)

  type sourceloc = {fileName:string, line:int, column:int}
    (* lines and columns are 1-based (minimum value is 1) *)

(* The representation of a sourcemap is a pair of lists.
     lines: line numbers for newlines and resynchronizations,
            labeled by initial charpos of each lines.
     files: file name for resynchronization, labeled by
            initial position for resynchronization

   The representation satisfies these invariants:
     * The lists are never empty (initialization is treated as a resynchronization).
     * Initial positions strictly decrease as we traverse the line list.
     * The last element in the line list contains the smallest valid position (1).
     * For every element in files, there is a corresponding SYNC element in
       lines, and visa versa.
*)

  (* line -- elements of lines list *)
  datatype line
    = LINE of int         (* line number, simple line bump *)
    | SYNC of int * int * int
       (* resynch point with line, column, and the size of the #line directive gap;
        * there will be an associated entry in files list, which MAY change
        * the current file name, but may be the same as the previous file name
        * if the #line directive does not specify a file name. *)

  type sourcemap = {lines: (charpos * line) list ref,
		    files: string list ref}
  (* INVARIANTS for sourcemaps:
   * (1) length (!lines) > 0
   * (2) length (!files) > 0
   * (3) charpos components of lines are strictly decreasing (ending in 1)
   * (4) length (!files) = number of SYNC elements in lines
   * (5) last (initial) element of lines is the SYNC line: (1, SYNC(1,1,0))
   *)

  val nullRegion : region = (0,0)
  (* nullRegion is a conventional default region value.  It does not represent
   * a proper region, and does not have a location in the file. In particular, it
   * should not be viewed as an empty region at the beginning of the input. *)

  (* newSourceMap: create a new sourcemap, given initial file name.
   * called only one place, in Source.newSource.  Initial position at the
   * start of the first line is 1, initial line number is 1. *)
  fun newSourceMap (fileName: string) : sourcemap =
      {files = ref [fileName],
       lines = ref [(1, SYNC(1,1,0))]}

  (* resynch: implements a #line directive, changing the current filename, line and column.
   * initpos is the position of the initial character of the #line comment
   * newpos is the character immediately following the end of the #line comment
   * ASSUMPTION: newpos > last line position in the sourcemap argument *)
  fun resynch ({files, lines}: sourcemap) (initpos, newpos, line, column, fileNameOp) =
      let val newFileName =
              case fileNameOp
                of SOME f => f
                 | NONE => hd (!files)   (* same as the current file name *)
       in files := newFileName :: !files;
	  lines := (newpos, SYNC(line,column,newpos-initpos)) :: !lines
      end

  fun lineNo (LINE l | SYNC(l,_,_)) = l

  (* Since pos is the position of the newline character, the next line doesn't
   * start until the succeeding position, pos+1. *)
  fun newline ({lines, ...}: sourcemap) pos =
      case !lines
        of (_,line) :: _ =>  lines := (pos+1, LINE(lineNo(line)+1)) :: !lines
         | nil => bug "newline"  (* invariant (1) violated *)

  fun lastLinePos ({lines, ...}: sourcemap) : charpos =
      case !lines
        of ((pos,line)::_) => pos
         | nil => bug "lastLineNumber" (* invariant (1) violated *)

  (* remove: remove from sourcemap lines those lines whose initial positions
   * exceed a target position, while maintaining the lines/files invariants.
   * The first line of the result will contain the target position.
   * ASSUMPTION: pos is >= initial pos of the sourcemap (normally 1). *)
  fun remove pos (lines: (charpos * line) list, files: string list) = 
      let fun strip (lines as (pos', line)::lines', files as (_ :: files')) = 
              if pos' > pos then 
                 (case line
                    of LINE _ => strip (lines', files)
		     | SYNC _ => strip (lines', files')) 
              else (lines, files)
	    | strip _ = bug "remove"
       in strip(lines, files)
      end

  (* ASSUMPTION: pos lies within the given line: 
   *   lineStart <= pos < start of next line  *)
  fun column ((lineStart, line), pos) =
      let val col = case line 
		      of LINE _  => 1
		       | SYNC(_,c,_) => c
       in pos - lineStart + col
      end

  fun filepos ({lines,files}:sourcemap) pos : sourceloc =
      case remove pos (!lines,!files)
        of ((linePos,line)::_, file::_) =>   (* pos is within top line *)
           {fileName = file, line = lineNo line, column = column((linePos,line), pos)}
         | _ => bug "filepos"

  (* Searching regions is a bit trickier, since we track file and line
   * simultaneously.  We exploit the invariant that every file entry has a
   * corresponding line entry.  We also exploit that only file entries
   * correspond to new regions. *)

  fun isNullRegion (0,0) = true
    | isNullRegion _ = false

  fun fileregion ({lines,files}: sourcemap) ((lo, hi): region) =
      if isNullRegion(lo,hi) then [] else
      let fun posToSourceLoc(pos, (linePos, line)::_,  file::_): sourceloc = 
		 {fileName=file, line=lineNo(line), column=column((linePos,line), pos)}
	    | posToSourceLoc _ = bug "posToSourceLoc"
	  
	  fun gather((linePos, line)::lines', files as file::files',
		     segment_end, answers) =
	       if linePos <= lo then (* last item *)
		 ({fileName=file, line=lineNo(line), column=column((linePos,line),lo)},
		  segment_end)
                 :: answers
	       else (case line
                       of LINE _ => gather(lines', files, segment_end, answers)
                        | SYNC(l,c,g) => (* crossing segment boundary *)
			  let val endpos =
				  (case lines'
				    of (linePos', _)::_ =>
                                       if linePos - g = linePos' then linePos' - 1
                                       else linePos - g)
                          in gather(lines', files',
				    posToSourceLoc (endpos, lines', files'),
				    ({fileName = file, line = l, column = c},
				     segment_end) :: answers)
			  end)
	    | gather _ = bug "fileregion"
	  val (lines0, files0) = remove hi (!lines,!files)
       in gather(lines0, files0, posToSourceLoc(hi,lines0,files0), [])
      end

   (* newlineCount : sourcemap -> region -> int 
    * determines the (approximate) number of newlines occurring in a region,
    * which may be 0 for a region that lies within a single line. Any lines
    * containing #line directives (i.e. SYNC lines) are not counted. *)
   fun newlineCount ({lines,files}: sourcemap) ((lo, hi): region) =
       let val his as (hilines, hifiles) = remove hi (!lines,!files)
	   val (lolines, lofiles) = remove lo his
	in (length hilines - length lolines) - (length hifiles - length lofiles)
       end

   (* removeLines - remove lines, stoping when the next line contains pos *)
   fun removeLines (pos, (lines: (charpos * line) list)) =
       let fun strip (lines as (pos1, _)::lines') =
	       if pos1 > pos then strip (lines') else lines
	     | strip _ = bug "removeLines"
	in strip lines
       end

   (* removeABO - remove all-but-one.
    * removes lines until pos is in the next to last line *)
   fun removeABO (pos, (line::lines: (charpos * line) list)) =
       let fun strip (lines as (pos1, line1)::lines', (pos0, line0)) =
	       if pos1 > pos then strip (lines', (pos1, line1))
	       else (pos0,line0)::lines
	     | strip _ = bug "removeABO"
	in strip(lines, line)
       end

   (* widenToLines - take a region and expand it to the beginning, respectively end,
    * of the first and last lines intersecting the region. This works only
    * for nonsegmented inputs (no noninitial SYNCs). Also assumes that the
    * region hi limit comes before the last newline in the input, which 
    * should be the case if the input ends with a newline. *)
   fun widenToLines ({lines,files}: sourcemap) ((lo, hi) : region) =
       if isNullRegion (lo,hi) then nullRegion
       else let val (lines1 as (pos1,_)::lines1') = removeABO(hi, !lines)
                val (pos2,_)::_ = removeLines(lo, lines1')
             in (pos2, pos1-1)
	    end

end (* structure SourceMap *)