1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213
|
module Ast = Ast
module Loc = Loc
module Warning = Warning
type t = {
ast : Ast.t;
warnings : Warning.t list;
reversed_newlines : (int * int) list;
original_pos : Lexing.position;
}
(* odoc uses an ocamllex lexer. The "engine" for such lexers is the standard
[Lexing] module.
As the [Lexing] module reads the input, it keeps track of only the byte
offset into the input. It is normally the job of each particular lexer
implementation to decide which character sequences count as newlines, and
keep track of line/column locations. This is usually done by writing several
extra regular expressions, and calling [Lexing.new_line] at the right time.
Keeping track of newlines like this makes the odoc lexer somewhat too
diffiult to read, however. To factor the aspect of keeping track of newlines
fully out of the odoc lexer, instead of having it keep track of newlines as
it's scanning the input, the input is pre-scanned before feeding it into the
lexer. A table of all the newlines is assembled, and used to convert offsets
into line/column pairs after the lexer emits tokens.
[reversed_newlines ~input ~comment_location offset] returns a list of pairs
of (line number * offset), allowing the easy conversion from the byte
[offset], relative to the beginning of a comment, into a location, relative
to the beginning of the file containing the comment. This can then be used
to convert from byte offset to line number / column number - a Loc.point,
and additionally for converting back from a Loc.point to a Lexing.position.
*)
let reversed_newlines : input:string -> (int * int) list =
fun ~input ->
let rec find_newlines line_number input_index newlines_accumulator =
if input_index >= String.length input then newlines_accumulator
else if
(* This is good enough to detect CR-LF also. *)
input.[input_index] = '\n'
then
find_newlines (line_number + 1) (input_index + 1)
((line_number + 1, input_index + 1) :: newlines_accumulator)
else find_newlines line_number (input_index + 1) newlines_accumulator
in
find_newlines 1 0 [ (1, 0) ]
(* [offset_to_location] converts from an offset within the comment text, where
[reversed_newlines] is the result of the above function and [comment_location]
is the location of the comment within its file. The function is meant to be
partially applied to its first two arguments, at which point it is passed to
the lexer, so it can apply the table to its emitted tokens. *)
let offset_to_location :
reversed_newlines:(int * int) list ->
comment_location:Lexing.position ->
int ->
Loc.point =
fun ~reversed_newlines ~comment_location byte_offset ->
let rec scan_to_last_newline reversed_newlines_prefix =
match reversed_newlines_prefix with
| [] -> assert false
| (line_in_comment, line_start_offset) :: prefix ->
if line_start_offset > byte_offset then scan_to_last_newline prefix
else
let column_in_comment = byte_offset - line_start_offset in
let line_in_file =
line_in_comment + comment_location.Lexing.pos_lnum - 1
in
let column_in_file =
if line_in_comment = 1 then
column_in_comment + comment_location.Lexing.pos_cnum
- comment_location.Lexing.pos_bol
else column_in_comment
in
{ Loc.line = line_in_file; column = column_in_file }
in
scan_to_last_newline reversed_newlines
(* Given a Loc.point and the result of [parse_comment], this function returns
a valid Lexing.position *)
let position_of_point : t -> Loc.point -> Lexing.position =
fun v point ->
let { reversed_newlines; original_pos; _ } = v in
let line_in_comment = point.Loc.line - original_pos.pos_lnum + 1 in
let rec find_pos_bol reversed_newlines_prefix =
match reversed_newlines_prefix with
| [] -> assert false
| [ _ ] -> original_pos.pos_bol
| (line_number, line_start_offset) :: prefix ->
if line_number > line_in_comment then find_pos_bol prefix
else line_start_offset + original_pos.pos_cnum
in
let pos_bol = find_pos_bol reversed_newlines in
let pos_lnum = point.Loc.line in
let pos_cnum = point.column + pos_bol in
let pos_fname = original_pos.pos_fname in
{ Lexing.pos_bol; pos_lnum; pos_cnum; pos_fname }
(* The main entry point for this module *)
let parse_comment ~location ~text =
let warnings = ref [] in
let reversed_newlines = reversed_newlines ~input:text in
let string_buffer = Buffer.create 256 in
let token_stream =
let lexbuf = Lexing.from_string text in
let offset_to_location =
offset_to_location ~reversed_newlines ~comment_location:location
in
let input : Lexer.input =
{
file = location.Lexing.pos_fname;
offset_to_location;
warnings;
lexbuf;
string_buffer;
}
in
Stream.from (fun _token_index -> Some (Lexer.token input lexbuf))
in
let ast, warnings = Syntax.parse warnings token_stream in
{ ast; warnings; reversed_newlines; original_pos = location }
(* Accessor functions, as [t] is opaque *)
let warnings t = t.warnings
let ast t = t.ast
(** [deindent ~what input ~start_offset s] "deindents" [s] by an offset computed
from [start_offset] and [input], corresponding to the begining of a code
block or verbatim. If that is not possible (eg there is a non-whitespace
line starting with less than [offset] whitespaces), it unindents as much as
possible and raises a warning. *)
let deindent : what:string -> loc:Loc.span -> string -> string * Warning.t list
=
fun ~what ~loc s ->
let offset = loc.start.column in
(* Whitespace-only lines do not count, so they return [None]. *)
let count_leading_whitespace line =
let rec count_leading_whitespace' index len =
if index = len then None
else
match line.[index] with
| ' ' | '\t' -> count_leading_whitespace' (index + 1) len
| _ -> Some index
in
let len = String.length line in
(* '\r' may remain because we only split on '\n' below. This is important
for the first line, which would be considered not empty without this check. *)
let len = if len > 0 && line.[len - 1] = '\r' then len - 1 else len in
count_leading_whitespace' 0 len
in
let lines = Astring.String.cuts ~sep:"\n" s in
let least_amount_of_whitespace =
List.fold_left
(fun least_so_far line ->
match (count_leading_whitespace line, least_so_far) with
| Some n, least when n < least -> n
| _ -> least_so_far)
offset lines
in
let warning =
if least_amount_of_whitespace < offset then
[ Parse_error.not_enough_indentation_in_code_block ~what loc ]
else []
in
let drop n line =
(* Since blank lines were ignored when calculating
[least_amount_of_whitespace], their length might be less than the
amount. *)
if String.length line < n then ""
else String.sub line n (String.length line - n)
in
let lines = List.map (drop least_amount_of_whitespace) lines in
(String.concat "\n" lines, warning)
(** Implements the rules for code block as specified in [odoc_for_authors],
section on code blocks and indentation. *)
let code_block_content ~what ~loc s =
let indent = loc.Loc.start.column in
(* Remove the first line (to first \n char, included) if it's whitespace only.
Otherwise, indent at [indent] level to account for offset. *)
let rec handle_first_newline index =
if index >= String.length s then String.make indent ' ' ^ s
else
match s.[index] with
| ' ' | '\t' | '\r' -> handle_first_newline (index + 1)
| '\n' -> String.sub s (index + 1) (String.length s - index - 1)
| _ -> String.make indent ' ' ^ s
in
let s = handle_first_newline 0 in
(* Remove the last line (from last \n char, included) if it's whitespace
only. *)
let rec handle_last_newline index =
if index < 0 then s
else
match s.[index] with
| ' ' | '\t' | '\r' -> handle_last_newline (index - 1)
| '\n' -> String.sub s 0 index
| _ -> s
in
let s = handle_last_newline (String.length s - 1) in
deindent ~what ~loc s
let verbatim_content loc c =
let what = "verbatim" in
code_block_content ~what ~loc c
let codeblock_content loc c =
let what = "code block" in
code_block_content ~what ~loc c
|