(** Lexer for .obuild files

    Tokenizes the line-based, indentation-sensitive format. *)
open Location
(** Token types. [tokenize_line] classifies every input line as exactly one
    of [KEY_VALUE], [BLOCK] or [BLANK]; [EOF] is appended once by [tokenize]
    after the last line. *)
type token =
  | KEY_VALUE of string * string  (** [key: value] or [key = value] *)
  | BLOCK of string * string list  (** [blockname arg1 arg2 ...] *)
  | BLANK  (** empty or comment line *)
  | EOF  (** end-of-input marker *)
type located_token = {
  tok : token;  (** the token itself *)
  loc : Location.loc;  (** source position attached to the token *)
  indent : int;
      (** number of leading space/tab characters on the token line, as
          returned by [count_indent]; [0] for the [EOF] token *)
}
(** A token with its location and indentation level *)
(** [new_located_token tok loc indent] packs a token, its location and its
    indentation level into a [located_token] record. *)
let new_located_token tok loc indent = { tok; loc; indent }
exception Lexer_error of loc * string
(** Lexer error, carrying a source location and a descriptive string.
    NOTE(review): no function visible in this file raises it; presumably it
    is raised or caught elsewhere in the project - confirm. *)
(** [is_whitespace c] is [true] exactly when [c] is a space or a tab.
    Newlines and carriage returns are not considered whitespace here. *)
let is_whitespace = function
  | ' ' | '\t' -> true
  | _ -> false
(** [count_indent s] measures the run of spaces and tabs at the start of [s].

    Returns [(n, rest)] where [n] is the number of leading whitespace
    characters (a tab counts as one character) and [rest] is [s] with that
    prefix removed. When [s] consists only of whitespace, [n] is the length
    of [s] and [rest] is the empty string. *)
let count_indent s =
  let len = String.length s in
  let pos = ref 0 in
  while !pos < len && is_whitespace s.[!pos] do
    incr pos
  done;
  let n = !pos in
  if n >= len then (n, "") else (n, String.sub s n (len - n))
(** [strip_trailing s] returns [s] without the spaces and tabs at its end.
    The result is the empty string when [s] is empty or all whitespace. *)
let strip_trailing s =
  (* [keep] is the length of the prefix to retain; walk it backwards over
     the trailing whitespace. *)
  let keep = ref (String.length s) in
  while !keep > 0 && is_whitespace s.[!keep - 1] do
    decr keep
  done;
  if !keep <= 0 then "" else String.sub s 0 !keep
(** [strip s] removes spaces and tabs from both ends of [s]: the leading run
    is dropped via [count_indent], the trailing run via [strip_trailing]. *)
let strip s = strip_trailing (snd (count_indent s))
(** [is_blank_or_comment s] is [true] when, after stripping surrounding
    spaces and tabs, [s] is empty or starts with ['#']. *)
let is_blank_or_comment s =
  let trimmed = strip s in
  String.length trimmed = 0 || trimmed.[0] = '#'
(** [find_key_value s] splits [s] at its first [':'] or ['='] character.

    Returns [Some (key, value)] where [key] is the text before the separator
    and [value] the text after it, both stripped of surrounding spaces and
    tabs; returns [None] when [s] contains neither character. The separator
    is searched for anywhere in [s], so either side may be empty. *)
let find_key_value s =
  let len = String.length s in
  let is_separator c = c = ':' || c = '=' in
  (* Position of the leftmost separator, if any. *)
  let rec first_separator i =
    if i >= len then None
    else if is_separator s.[i] then Some i
    else first_separator (i + 1)
  in
  match first_separator 0 with
  | None -> None
  | Some pos ->
    let key = strip (String.sub s 0 pos) in
    let value = strip (String.sub s (pos + 1) (len - pos - 1)) in
    Some (key, value)
(** [split_words s] splits [s] into its maximal runs of non-whitespace
    characters, in order of appearance. Words are separated by one or more
    spaces or tabs (see [is_whitespace]); leading and trailing whitespace
    produce no empty words. Returns [[]] for an empty or all-whitespace
    string. *)
let split_words s =
  let len = String.length s in
  (* [scan stop i] is the first index [>= i] whose character satisfies
     [stop], or [len] when there is none. *)
  let rec scan stop i =
    if i >= len || stop s.[i] then i else scan stop (i + 1)
  in
  let is_word_char c = not (is_whitespace c) in
  let rec loop i words =
    let start = scan is_word_char i in
    if start >= len then List.rev words
    else
      let stop = scan is_whitespace start in
      (* Extract the whole word with a single [String.sub] rather than
         growing a string one character at a time. *)
      loop stop (String.sub s start (stop - start) :: words)
  in
  loop 0 []
(** [tokenize_line line_num line] turns one input line into a located token.

    Classification, in order:
    - a blank or [#]-comment line gives [BLANK];
    - a line containing [':'] or ['='] gives [KEY_VALUE];
    - anything else is split into words and gives [BLOCK (keyword, args)].

    The [indent] field is the number of leading spaces/tabs. The location is
    built with [new_location line_num (indent + 1)]
    (presumably line and column - confirm against [Location]). *)
let tokenize_line line_num line =
  let indent, content = count_indent line in
  let loc = new_location line_num (indent + 1) in
  let tok =
    if is_blank_or_comment line then BLANK
    else
      match find_key_value content with
      | Some (key, value) -> KEY_VALUE (key, value)
      | None -> (
        (* No separator - must be a block header *)
        match split_words content with
        | [] -> BLANK
        | keyword :: args -> BLOCK (keyword, args))
  in
  { tok; loc; indent }
(** Merge BLOCK tokens that are continuations of a KEY_VALUE into it.

    A BLOCK token is a continuation when:
    - It immediately follows a KEY_VALUE token (with no intervening tokens),
      or follows another continuation of that same KEY_VALUE
    - Its indentation is strictly greater than the KEY_VALUE's indentation

    The words of each continuation are joined with single spaces and appended
    to the value (separated by a space unless the value is empty). Any other
    token, including BLANK, ends the run of continuations. The merged token
    keeps the location and indentation of the original KEY_VALUE.

    This handles multi-line field values like:
      modules: A, B,
               C, D   <- tokenized as BLOCK("C,", ["D"]) but is really a continuation

    The traversal is tail-recursive, so stack use does not grow with the
    number of tokens.
*)
let merge_continuations tokens =
  (* Greedily absorb the leading BLOCK tokens indented deeper than
     [kv_indent] into [v]; returns the extended value and the untouched
     remainder of the list. *)
  let rec collect kv_indent v = function
    | { tok = BLOCK (name, args); indent; _ } :: more when indent > kv_indent ->
      let cont = String.concat " " (name :: args) in
      let v' = if v = "" then cont else v ^ " " ^ cont in
      collect kv_indent v' more
    | remaining -> (v, remaining)
  in
  (* [acc] holds the output tokens in reverse order. *)
  let rec loop acc = function
    | [] -> List.rev acc
    | ({ tok = KEY_VALUE (k, v); indent = kv_indent; _ } as kv_tok) :: rest ->
      let full_v, remaining = collect kv_indent v rest in
      loop ({ kv_tok with tok = KEY_VALUE (k, full_v) } :: acc) remaining
    | t :: rest -> loop (t :: acc) rest
  in
  loop [] tokens
(** [tokenize input] lexes a whole document.

    The input is cut into lines with [String_utils.split '\n'], and each line
    is passed to [tokenize_line] with a line number starting at 1. BLANK
    tokens are dropped, an [EOF] token (column 1, indent 0, line number one
    past the last line processed) is appended, and the result goes through
    [merge_continuations]. *)
let tokenize input =
  let rec loop line_num acc = function
    | [] ->
      let eof = { tok = EOF; loc = { line = line_num; col = 1 }; indent = 0 } in
      List.rev (eof :: acc)
    | line :: rest ->
      (* Blank lines still consume a line number but emit no token. *)
      let acc' =
        match tokenize_line line_num line with
        | { tok = BLANK; _ } -> acc
        | token -> token :: acc
      in
      loop (line_num + 1) acc' rest
  in
  merge_continuations (loop 1 [] (String_utils.split '\n' input))
(** [tokenize_file path] reads the whole file at [path] and tokenizes its
    contents with [tokenize].

    The channel is closed before tokenizing, and also when reading fails, so
    no file descriptor is leaked on error.

    @raise Sys_error if the file cannot be opened.
    @raise End_of_file if fewer bytes can be read than [in_channel_length]
    reported. *)
let tokenize_file path =
  let ic = open_in path in
  let s =
    try
      let n = in_channel_length ic in
      let buf = Compat.bytes_create n in
      really_input ic buf 0 n;
      Compat.bytes_to_string buf
    with e ->
      (* Release the channel on the error path, then propagate unchanged. *)
      close_in_noerr ic;
      raise e
  in
  close_in ic;
  tokenize s
(** [token_to_string tok] renders [tok] as a short human-readable string for
    debugging, e.g. [KEY_VALUE(k, v)] or [BLOCK(name, [a; b])]. *)
let token_to_string tok =
  match tok with
  | EOF -> "EOF"
  | BLANK -> "BLANK"
  | BLOCK (name, args) ->
    let joined = String.concat "; " args in
    Printf.sprintf "BLOCK(%s, [%s])" name joined
  | KEY_VALUE (k, v) -> Printf.sprintf "KEY_VALUE(%s, %s)" k v
(** [located_token_to_string t] renders [t] for debugging as
    [line:col indent=N token], using [token_to_string] for the token part. *)
let located_token_to_string t =
  let { tok; loc; indent } = t in
  let rendered = token_to_string tok in
  Printf.sprintf "%d:%d indent=%d %s" loc.line loc.col indent rendered