1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193
|
(* A simple lexer, which distinguishes integers, floats and single character
delimiters. Quoted strings are also distinguished, and allow escaped quotes.
Any other non-whitespace-including string is returned as an [Ident]. *)
open Pdfutil
(* To avoid too much storage allocation (and hence garbage collection), we use
the same data type for this very basic lexing module as for the main lexing in
Pdfread. Eventually, we may unify this with the parsing type too. *)
type t =
| LexNull
| LexBool of bool
| LexInt of int
| LexReal of float
| LexString of string
| LexName of string
| LexLeftSquare
| LexRightSquare
| LexLeftDict
| LexRightDict
| LexStream of Pdf.stream
| LexEndStream
| LexObj
| LexEndObj
| LexR
| LexComment of string
| StopLexing
| LexNone
let string_of_token = function
| LexInt i -> "Int " ^ string_of_int i
| LexReal f -> "Float " ^ string_of_float f
| LexString s -> "String " ^ s
| LexName s -> "Ident " ^ s
| LexNull -> "Nothing"
| _ -> "GenLexNone"
let string_of_tokens ts =
fold_left (fun a b -> a ^ "\n " ^ b) "" (map string_of_token ts)
let is_delimiter = function
| '(' | ')' | '<' | '>' | '[' | ']' | '{' | '}' | '%' | '/' -> true
| _ -> false
let is_not_whitespace = function
| '\000' | '\009' | '\010' | '\012' | ' ' | '\013' -> false
| _ -> true
let is_whitespace_or_delimiter = function
| '\000' | '\009' | '\010' | '\012' | ' ' | '\013'
| '(' | ')' | '<' | '>' | '[' | ']' | '{' | '}' | '%' | '/' -> true
| _ -> false
(* Because String.copy has been removed from OCaml. *)
let string_copy s =
Bytes.unsafe_to_string (Bytes.copy (Bytes.unsafe_of_string s))
let lex_item s =
let len = String.length s in
if len = 0 then LexNull else
try
match String.unsafe_get s 0 with
| 'a'..'z' | 'A'..'Z' ->
LexName (string_copy s)
| '\"' when len >= 2 ->
LexString (String.sub s 1 (len - 2))
| _ ->
let rec isint s pos =
pos = ~-1 ||
match String.unsafe_get s pos with
| '.' -> false
| _ -> isint s (pos - 1)
in
if isint s (len - 1)
then
begin try LexInt (int_of_string s) with
_ ->
begin try
(* Detect malformed numbers "--2" etc. which can appear in some PDFs. *)
if len > 1 && String.unsafe_get s 0 = '-' && String.unsafe_get s 1 = '-' then
LexInt (int_of_string (String.sub s 1 (len - 1)))
else
raise Exit (* nothing we can salvage *)
with
_ -> LexReal (float_of_string s) (* Integer > 2^30 on 32 bit system, int_of_string would fail. *)
end
end
else
begin try LexReal (float_of_string s) with
_ ->
(* Detect malformed numbers "--2.5" etc. which can appear in some PDFs. *)
if len > 1 && String.unsafe_get s 0 = '-' && String.unsafe_get s 1 = '-' then
LexReal (float_of_string (String.sub s 1 (len - 1)))
else
raise Exit (* nothing we can salvage *)
end
with
_ -> LexName (string_copy s)
(* Return the string between and including the current position and before the
next character satisfying a given predicate, leaving the position at the
character following the last one returned. End of input is considered a
delimiter, and the characters up to it are returned if it is reached. *)
let rec lengthuntil i n =
match i.Pdfio.input_byte () with
| x when x = Pdfio.no_more -> n
| x ->
if is_whitespace_or_delimiter (Char.unsafe_chr x)
then n
else lengthuntil i (n + 1)
(* Pre-built strings to prevent allocation just to do int_of_string,
float_of_string etc. What we actually need is int_of_substring etc, but this
will require patching OCaml. *)
let strings =
Array.init 17 (fun i -> Bytes.make i ' ')
let getuntil i =
let p = i.Pdfio.pos_in () in
let l = lengthuntil i 0 in
i.Pdfio.seek_in p;
let s = if l <= 16 then Array.unsafe_get strings l else Bytes.create l in
Pdfio.setinit_string i s 0 l;
Bytes.unsafe_to_string s (* Will never be altered, but copied or discarded by get_string_inner. *)
(* The same, but don't return anything. *)
let rec ignoreuntil f i =
match i.Pdfio.input_byte () with
| x when x = Pdfio.no_more -> ()
| x -> if f (Char.unsafe_chr x) then Pdfio.rewind i else ignoreuntil f i
(* Position on the next non-whitespace character. *)
let dropwhite i =
ignoreuntil is_not_whitespace i
(* Get a quoted string, including the quotes. Any quotes inside must be
escaped. *)
let rec get_string_inner b i =
match i.Pdfio.input_byte () with
| x when x = Pdfio.no_more -> raise End_of_file
| x when x = int_of_char '\"' ->
Buffer.add_char b '\"'
| x when x = int_of_char '\\' ->
begin match i.Pdfio.input_byte () with
| x when x = Pdfio.no_more-> raise End_of_file
| x when x = int_of_char '\"' ->
Buffer.add_char b '\"';
get_string_inner b i
| x ->
Buffer.add_char b '\\';
Buffer.add_char b (Char.unsafe_chr x);
get_string_inner b i
end
| x ->
Buffer.add_char b (Char.unsafe_chr x);
get_string_inner b i
let b = Buffer.create 30
let get_string i =
Pdfio.nudge i;
Buffer.clear b;
Buffer.add_char b '\"';
get_string_inner b i;
Buffer.contents b
(* Repeatedly take a whitespace-or-delimiter-delimited section from the input,
and scan it *)
let get_section i =
match Pdfio.peek_byte i with
| x when x = Pdfio.no_more -> ""
| _ ->
dropwhite i;
match Pdfio.peek_byte i with
| x when x = Pdfio.no_more -> ""
| x when Char.unsafe_chr x = '\"' -> get_string i
| x ->
let x = Char.unsafe_chr x in
if is_delimiter x
then (Pdfio.nudge i; string_of_char x)
else getuntil i
let lex_single i =
lex_item (get_section i)
let rec lex_inner prev i =
match lex_item (get_section i) with
| LexNull -> rev prev
| x -> lex_inner (x::prev) i
let lex = lex_inner []
let lex_string s =
lex (Pdfio.input_of_string s)
|