1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445
|
(* csv.ml - comma separated values parser
*
* $Id: csv.ml,v 1.14 2006/11/24 15:49:24 rich Exp $
*)
(* The format of CSV files:
*
* Each field starts with either a double quote char or some other
* char. For the some other char case things are simple: just read up
* to the next comma (,) which marks the end of the field.
*
* In the case where a field begins with a double quote char the
* parsing rules are different. Any double quotes are doubled ("") and
* we finish reading when we reach an undoubled quote. eg: "The
* following is a quote: "", and that's all" is the CSV equivalent of
* the following literal field: The following is a quote: ", and that's
* all
*
* "0 is the quoted form of ASCII NUL.
*
* CSV fields can also contain literal carriage return characters, if
* they are quoted, eg: "This field
* is split over lines" represents a
* single field containing a \n.
*
* Excel will only use the quoting format if a field contains a double
* quote or comma, although there's no reason why Excel couldn't always
* use the quoted format.
*
* The practical upshot of this is that you can't split a line in a CSV
* file just by looking at the commas. You need to parse each field
* separately.
*
* How we represent CSV files:
*
* We load in the whole CSV file at once, and store it internally as a
* 'string list list' type (note that each line in the CSV file can,
* and often will, have different lengths). We then provide simple
* functions to read the CSV file line-by-line, copy it out, or copy a
* subset of it into a matrix.
*)
open Printf
(* Uncomment the next line to enable Extlib's List function. These
* avoid stack overflows on really huge CSV files.
*)
(*open ExtList*)
type t = string list list
exception Bad_CSV_file of string
let rec dropwhile f = function
| [] -> []
| x :: xs when f x -> dropwhile f xs
| xs -> xs
(* from extlib: *)
let rec drop n = function
| _ :: l when n > 0 -> drop (n-1) l
| l -> l
let rec take n = function
| x :: xs when n > 0 -> x :: take (pred n) xs
| _ -> []
let lines = List.length
let columns csv =
List.fold_left max 0 (List.map List.length csv)
type state_t = StartField
| InUnquotedField
| InQuotedField
| InQuotedFieldAfterQuote
let load_rows ?(separator = ',') f chan =
let row = ref [] in (* Current row. *)
let field = ref [] in (* Current field. *)
let state = ref StartField in (* Current state. *)
let end_of_field () =
let field_list = List.rev !field in
let field_len = List.length field_list in
let field_str = String.create field_len in
let rec loop i = function
[] -> ()
| x :: xs ->
field_str.[i] <- x;
loop (i+1) xs
in
loop 0 field_list;
row := field_str :: !row;
field := [];
state := StartField
in
let empty_field () =
row := "" :: !row;
field := [];
state := StartField
in
let end_of_row () =
let row_list = List.rev !row in
f row_list;
row := [];
state := StartField
in
let rec loop () =
let c = input_char chan in
if c != '\r' then ( (* Always ignore \r characters. *)
match !state with
StartField -> (* Expecting quote or other char. *)
if c = '"' then (
state := InQuotedField;
field := []
) else if c = separator then (* Empty field. *)
empty_field ()
else if c = '\n' then ( (* Empty field, end of row. *)
empty_field ();
end_of_row ()
) else (
state := InUnquotedField;
field := [c]
)
| InUnquotedField -> (* Reading chars to end of field. *)
if c = separator then (* End of field. *)
end_of_field ()
else if c = '\n' then ( (* End of field and end of row. *)
end_of_field ();
end_of_row ()
) else
field := c :: !field
| InQuotedField -> (* Reading chars to end of field. *)
if c = '"' then
state := InQuotedFieldAfterQuote
else
field := c :: !field
| InQuotedFieldAfterQuote ->
if c = '"' then ( (* Doubled quote. *)
field := c :: !field;
state := InQuotedField
) else if c = '0' then ( (* Quote-0 is ASCII NUL. *)
field := '\000' :: !field;
state := InQuotedField
) else if c = separator then (* End of field. *)
end_of_field ()
else if c = '\n' then ( (* End of field and end of row. *)
end_of_field ();
end_of_row ()
) else ( (* Bad single quote in field. *)
field := c :: '"' :: !field;
state := InQuotedField
)
); (* end of match *)
loop ()
in
try
loop ()
with
End_of_file ->
(* Any part left to write out? *)
(match !state with
StartField ->
if !row <> [] then
( empty_field (); end_of_row () )
| InUnquotedField | InQuotedFieldAfterQuote ->
end_of_field (); end_of_row ()
| InQuotedField ->
raise (Bad_CSV_file "Missing end quote after quoted field.")
)
let load_in ?separator chan =
let csv = ref [] in
let f row =
csv := row :: !csv
in
load_rows ?separator f chan;
List.rev !csv
let load ?separator filename =
let chan, close =
match filename with
| "-" -> stdin, false
| filename -> open_in filename, true in
let csv = load_in ?separator chan in
if close then close_in chan;
csv
let trim ?(top=true) ?(left=true) ?(right=true) ?(bottom=true) csv =
let rec empty_row = function
| [] -> true
| x :: xs when x <> "" -> false
| x :: xs -> empty_row xs
in
let csv = if top then dropwhile empty_row csv else csv in
let csv =
if right then
List.map (fun row ->
let row = List.rev row in
let row = dropwhile ((=) "") row in
let row = List.rev row in
row) csv
else csv in
let csv =
if bottom then (
let csv = List.rev csv in
let csv = dropwhile empty_row csv in
let csv = List.rev csv in
csv
) else csv in
let empty_left_cell =
function [] -> true | x :: xs when x = "" -> true | _ -> false in
let empty_left_col =
List.fold_left (fun a row -> a && empty_left_cell row) true in
let remove_left_col =
List.map (function [] -> [] | x :: xs -> xs) in
let rec loop csv =
if empty_left_col csv then (
let csv = remove_left_col csv in
loop csv
) else csv
in
let csv = if left then loop csv else csv in
csv
let square csv =
let columns = columns csv in
List.map (
fun row ->
let n = List.length row in
let row = List.rev row in
let rec loop acc = function
| 0 -> acc
| i -> "" :: loop acc (i-1)
in
let row = loop row (columns - n) in
List.rev row
) csv
let is_square csv =
let columns = columns csv in
List.for_all (fun row -> List.length row = columns) csv
let rec set_columns cols = function
| [] -> []
| r :: rs ->
let rec loop i cells =
if i < cols then (
match cells with
| [] -> "" :: loop (succ i) []
| c :: cs -> c :: loop (succ i) cs
)
else []
in
loop 0 r :: set_columns cols rs
let rec set_rows rows csv =
if rows > 0 then (
match csv with
| [] -> [] :: set_rows (pred rows) []
| r :: rs -> r :: set_rows (pred rows) rs
)
else []
let set_size rows cols csv =
set_columns cols (set_rows rows csv)
let sub r c rows cols csv =
let csv = drop r csv in
let csv = List.map (drop c) csv in
let csv = set_rows rows csv in
let csv = set_columns cols csv in
csv
(* Compare two rows for semantic equality - ignoring any blank cells
* at the end of each row.
*)
let rec compare_row (row1 : string list) row2 =
match row1, row2 with
| [], [] -> 0
| x :: xs, y :: ys ->
let c = compare x y in
if c <> 0 then c else compare_row xs ys
| "" :: xs , [] ->
compare_row xs []
| x :: xs, [] ->
1
| [], "" :: ys ->
compare_row [] ys
| [], y :: ys ->
-1
(* Semantic equality for CSV files. *)
let rec compare (csv1 : t) csv2 =
match csv1, csv2 with
| [], [] -> 0
| x :: xs, y :: ys ->
let c = compare_row x y in
if c <> 0 then c else compare xs ys
| x :: xs, [] ->
let c = compare_row x [] in
if c <> 0 then c else compare xs []
| [], y :: ys ->
let c = compare_row [] y in
if c <> 0 then c else compare [] ys
(* Concatenate - arrange left to right. *)
let rec concat = function
| [] -> []
| [csv] -> csv
| left_csv :: csvs ->
(* Concatenate the remaining CSV files. *)
let right_csv = concat csvs in
(* Set the height of the left and right CSVs to the same. *)
let nr_rows = max (lines left_csv) (lines right_csv) in
let left_csv = set_rows nr_rows left_csv in
let right_csv = set_rows nr_rows right_csv in
(* Square off the left CSV. *)
let left_csv = square left_csv in
(* Prepend the right CSV rows with the left CSV rows. *)
List.map (
fun (left_row, right_row) -> List.append left_row right_row
) (List.combine left_csv right_csv)
let to_array csv =
Array.of_list (List.map Array.of_list csv)
let of_array csv =
List.map Array.to_list (Array.to_list csv)
let associate header data =
let nr_cols = List.length header in
let rec trunc = function
| 0, _ -> []
| n, [] -> "" :: trunc (n-1, [])
| n, (x :: xs) -> x :: trunc (n-1, xs)
in
List.map (
fun row ->
let row = trunc (nr_cols, row) in
List.combine header row
) data
let save_out ?(separator = ',') chan csv =
(* Quote a single CSV field. *)
let quote_field field =
if String.contains field separator ||
String.contains field '\"' ||
String.contains field '\n'
then (
let buffer = Buffer.create 100 in
Buffer.add_char buffer '\"';
for i = 0 to (String.length field) - 1 do
match field.[i] with
'\"' -> Buffer.add_string buffer "\"\""
| c -> Buffer.add_char buffer c
done;
Buffer.add_char buffer '\"';
Buffer.contents buffer
)
else
field
in
let separator = String.make 1 separator in
List.iter (fun line ->
output_string chan (String.concat separator
(List.map quote_field line));
output_char chan '\n') csv
let print ?separator csv =
save_out ?separator stdout csv; flush stdout
let save ?separator file csv =
let chan = open_out file in
save_out ?separator chan csv;
close_out chan
let save_out_readable chan csv =
(* Escape all the strings in the CSV file first. *)
(* XXX Why are we doing this? I commented it out anyway.
let csv = List.map (List.map String.escaped) csv in
*)
(* Find the width of each column. *)
let widths =
(* Don't consider rows with only a single element - typically
* long titles.
*)
let csv = List.filter (function [_] -> false | _ -> true) csv in
(* Square the CSV file - makes the next step simpler to implement. *)
let csv = square csv in
match csv with
| [] -> []
| row1 :: rest ->
let lengths_row1 = List.map String.length row1 in
let lengths_rest = List.map (List.map String.length) rest in
let max2rows r1 r2 =
let rp =
try List.combine r1 r2
with
Invalid_argument "List.combine" ->
failwith (sprintf "Csv.save_out_readable: internal error: length r1 = %d, length r2 = %d" (List.length r1) (List.length r2)) in
List.map (fun ((a : int), (b : int)) -> max a b) rp
in
List.fold_left max2rows lengths_row1 lengths_rest in
(* Print out each cell at the correct width. *)
let rec repeat f = function
| 0 -> ()
| i -> f (); repeat f (i-1)
in
List.iter (
function
| [cell] -> (* Single column. *)
output_string chan cell;
output_char chan '\n'
| row -> (* Other. *)
(* Pair up each cell with its max width. *)
let row =
let rec loop = function
| ([], _) -> []
| (_, []) -> failwith "Csv.save_out_readable: internal error"
| (cell :: cells, width :: widths) ->
(cell, width) :: loop (cells, widths)
in
loop (row, widths) in
List.iter (
fun (cell, width) ->
output_string chan cell;
let n = String.length cell in
repeat (fun () -> output_char chan ' ') (width - n + 1)
) row;
output_char chan '\n'
) csv
let print_readable = save_out_readable stdout
|