File: pdf.ml

package info (click to toggle)
camlpdf 0.5-1
  • links: PTS, VCS
  • area: non-free
  • in suites: squeeze, wheezy
  • size: 1,516 kB
  • ctags: 2,689
  • sloc: ml: 18,229; ansic: 139; makefile: 139
file content (768 lines) | stat: -rw-r--r-- 25,876 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
(* \part{CamlPDF}\chaptertitle{PDF}{Representing PDF files} *)

(* This module declares a data type which represents an Adobe PDF document,
and defines various simple operations on it. *)
open Utility
open Pdfio

(* \section{Data Type for Representing PDF Documents} *)

(* Predicate on characters delimiting entities. *)
let is_delimiter = function
  | '(' | ')' | '<' | '>' | '[' | ']' | '{' | '}' | '%' | '/' -> true
  | _ -> false

(* \intf Streams of binary data, byte-addressable, can either be in memory ([Got]) or
still in an input channel ([ToGet]). *)
type stream =
  | Got of bytestream
  | ToGet of input * int64 * int64 (*r input, position, length *)

(* \intf Type for individual PDF objects. A [Name] includes the initial `/'. A
[Stream] consists of a reference to a pair of the stream dictionary (another
[pdfobject]) and a [stream]. Thus a [pdfobject] is technically mutable.  However,
at the user level, it is intended to be immutable: changes should be limited to
encoding and decoding of the stream.

Note that pdfobjects are not always amenable to polymorphic equality testing,
since the [Io.input] in the [ToGet] part of a [stream] contains functional
values. *)
type pdfobject =
  | Null
  | Boolean of bool
  | Integer of int
  | Real of float
  | String of string
  | Name of string 
  | Array of pdfobject list
  | Dictionary of (string * pdfobject) list
  | Stream of (pdfobject * stream) ref
  | Indirect of int

(*IF-OCAML*)
(* Pdf objects are stored in an efficient map structure. *)
module PdfObjMap =
  Map.Make
    (struct
       type t = int
       let compare = compare
    end)

let pdfobjmap_find = PdfObjMap.find
let pdfobjmap_mapi = PdfObjMap.mapi
let pdfobjmap_iter = PdfObjMap.iter
let pdfobjmap_remove = PdfObjMap.remove
let pdfobjmap_add = PdfObjMap.add
let pdfobjmap_empty = PdfObjMap.empty

(* An object is either lexed, or needs to be lexed from a position in the
input. *)
type objectdata =
  | Parsed of pdfobject
  | ToParse

(* We hold the maximum object number in use, [maxobjnum] to allow easy
production of new keys for the map. *)
type pdfobjects =
  {mutable maxobjnum : int;
   mutable parse : (PdfObjMap.key -> pdfobject) option;
   mutable objectsfromstream : int list;
   mutable pdfobjects : (objectdata ref * int) PdfObjMap.t} (*r int is generation *)
(*ENDIF-OCAML*)

(*i*)(*F#

(* An object is either lexed, or needs to be lexed from a position in the
input. *)
type objectdata =
  | Parsed of pdfobject
  | ToParse

let PdfObjMap : (int, objectdata ref * int) Map.Provider = Map.Make compare

let pdfobjmap_find = PdfObjMap.find
let pdfobjmap_mapi = PdfObjMap.mapi
let pdfobjmap_iter = PdfObjMap.iter
let pdfobjmap_remove = PdfObjMap.remove
let pdfobjmap_add = PdfObjMap.add
let pdfobjmap_empty = PdfObjMap.empty

type pdfobjects =
  {mutable maxobjnum : int;
   mutable parse : (int -> pdfobject) option;
   mutable objectsfromstream : int list;
   mutable pdfobjects : Tagged.Map<int, (objectdata ref * int), System.Collections.Generic.IComparer<int>>} (*r int is generation *)

F#*)(*i*)


(* \intf PDF Document. The major and minor version numbers, the root object number,
the list of objects and the trailer dictionary.

This represents the contents of a PDF file's user objects (object streams and
other mechanisms involved only in reading and writing are abstracted away). *)
type pdfdoc =
  {mutable major : int; 
   mutable minor : int;
   mutable root : int;
   mutable objects : pdfobjects; 
   mutable trailerdict : pdfobject} 

let set_streamobjects pdf os =
  pdf.objects.objectsfromstream <- os

let get_streamobjects pdf =
  pdf.objects.objectsfromstream

(* \intf The null PDF document. *)
let empty () =
  {major = 1;
   minor = 0;
   root = 0;
   objects = {maxobjnum = 0; parse = None; objectsfromstream = []; pdfobjects = pdfobjmap_empty};
   trailerdict = Dictionary []}

(* \intf General exception for low-level errors. *)
exception PDFError of string

(* \section{Utility functions} *)

(* \intf Predicate on those characters considered whitespace in PDF files. *)
let is_whitespace = function
  | '\000' | '\009' | '\010' | '\012' | ' ' | '\013' -> true
  | _ -> false

(* \intf Get a stream from disk if it hasn't already been got. *)
let getstream = function
  | Stream ({contents = (d, ToGet (i, o, l))} as stream) ->
      if l = 0L then stream := (d, Got (mkstream 0)) else
        let s = mkstream (i64toi l) in
          begin try
            (*IF-OCAML*)i.seek_in o; (*ENDIF-OCAML*)
            (*i*)(*F#i.seek_in (i64toi o); F#*)(*i*)
            for c = 0 to i64toi l - 1 do
              match i.input_byte () with
              | b when b = Pdfio.no_more -> dpr "H"; raise End_of_file
              | b -> sset s c b
            done;
            stream := (d, Got s)
          with
            End_of_file ->
              raise (PDFError "Pdf.getstream: can't read stream.")
          end
  | Stream _ -> ()
  | _ -> raise (PDFError "Pdf.getstream: not a stream")

let recurse_array (f : pdfobject -> pdfobject) elts =
  Array (map f elts)

(* \intf Similarly for dictionaries. *)
let recurse_dict (f : pdfobject -> pdfobject) elts =
  let names, objects = split elts in
    let objects' = map f objects in
      Dictionary (combine names objects')

(* \intf Return a float from a PDF number. *)
let getnum = function
  | Real a -> a
  | Integer a -> float a
  | _ -> raise (PDFError "Pdf.getnum: not a number")

(* \intf Parse a PDF rectangle data structure. Returns min x, min y, max x, max y. *)
let parse_rectangle = function
  | Array [a; b; c; d] ->
      begin try
        let x, y, x', y' =
          getnum a, getnum b, getnum c, getnum d
        in
          fmin x x', fmin y y', fmax x x', fmax y y'
      with
        PDFError _ -> raise (PDFError "Pdf.parse_rectangle: bad rectangle")
      end
  | _ -> raise (PDFError "Pdf.parse_rectangle: not a rectangle")

let change_obj doc i obj =
  match fst (pdfobjmap_find i doc.objects.pdfobjects) with
  | {contents = Parsed _} -> assert false
  | {contents = ToParse} as r -> r := Parsed obj

(* Parse an object [n] in document [pdf], updating the object in the document so
it is ready-parsed should it be required again. *)
let parse_lazy pdf n =
  match pdf.objects.parse with
  | None -> raise (Assert_failure ("Pdf.parse_lazy", 0, 0))
  | Some f ->
      let obj = f n in
        change_obj pdf n obj;
        obj

(* \intf Look up an object. On an error return [Pdf.Null] *)
let lookup_obj doc i =
  try
    match fst (pdfobjmap_find i doc.objects.pdfobjects) with
    | {contents = Parsed obj} -> obj
    | {contents = ToParse} -> parse_lazy doc i
  with
    Not_found -> dpr "2H"; Null

let catalog_of_pdf pdf =
  try lookup_obj pdf pdf.root with
    Not_found -> raise (PDFError "No catalog")

(* \intf Given any pdf document and object, follow indirections to yield a
direct object. A hanging indirect is defined as [Null]. *)
let rec direct pdf = function
  | Indirect i ->
      begin try
        match fst (pdfobjmap_find i pdf.objects.pdfobjects) with
        | {contents = Parsed pdfobject} -> direct pdf pdfobject
        | {contents = ToParse} -> parse_lazy pdf i
      with
        Not_found -> dpr "2I"; Null
      end
  | obj -> obj

(* \intf Apply a function on Stream objects to all streams in a PDF document. We
assume stream dictionaries don't have indirect references to an object which
itself contains a stream. *)
let map_stream f pdf =
  let rec map_stream_inner f i = function
    | {contents = Parsed (Stream _ as stream)}, g -> ref (Parsed (f stream)), g
    | {contents = Parsed obj}, g -> ref (Parsed (obj)), g
    | {contents = ToParse}, g -> map_stream_inner f i (ref (Parsed (parse_lazy pdf i)), g)
  in
    let objects' =
      {pdf.objects with
         pdfobjects = pdfobjmap_mapi (map_stream_inner f) pdf.objects.pdfobjects}
    in
      {pdf with objects = objects'}

(* \intf Iterate over a stream. *)
let iter_stream f pdf =
  let rec iter_stream_inner f i = function
    | {contents = Parsed (Stream _ as stream)}, g -> f stream
    | {contents = ToParse} as r, g ->
        r := Parsed (parse_lazy pdf i);
        iter_stream_inner f i (r, g)
    | _ -> ()
  in
    pdfobjmap_iter (iter_stream_inner f) pdf.objects.pdfobjects

(* \intf Lookup a key in a dictionary, following indirect references,  returning
[None] on any failure. This works on both plain dictionaries and streams. *)
let lookup_direct pdf key dict =
  match direct pdf dict with
  | Dictionary d | Stream {contents = (Dictionary d, _)} ->
      begin match lookup key d with
      | None -> None
      | Some o -> Some (direct pdf o)
      end
  | _ -> None

(* \intf Look up under a key and its alternate. Return the value associated with the key that worked, or [None] if neither did. *)
let lookup_direct_orelse pdf k k' d =
  match lookup_direct pdf k d with
  | None -> lookup_direct pdf k' d
  | result -> result

(* \intf Look something up in a dictionary, failing with given exception if not
found. We make direct both the dictionary and the result of the lookup. This
also allows us to look things up in a stream dictionary transparently. *)
let lookup_exception (exp : exn) pdf key dict =
  let dict' =
    match direct pdf dict with
    | Dictionary d | Stream {contents = Dictionary d, _} -> d
    | o -> raise (PDFError "not a dictionary")
  in
    match lookup key dict' with
    | None -> dpr "G"; raise exp
    | Some v -> direct pdf v

(* \intf A specialised one raising [PDFError]. *)
let lookup_fail text =
  lookup_exception (PDFError text)

(* \intf Parse a matrix. *)
let parse_matrix pdf name dict =
  match lookup_direct pdf name dict with
  | None -> Transform.i_matrix
  | Some (Array [a; b; c; d; e; f]) ->
      let a = getnum a and b = getnum b and c = getnum c
      and d = getnum d and e = getnum e and f = getnum f in
        {Transform.a = a; Transform.b = b; Transform.c = c;
         Transform.d = d; Transform.e = e; Transform.f = f}
  | _ -> raise (PDFError "Malformed matrix")

(* \intf Make a matrix *)
let make_matrix tr =
  Array
    [Real tr.Transform.a; Real tr.Transform.b; Real tr.Transform.c;
     Real tr.Transform.d; Real tr.Transform.e; Real tr.Transform.f]

(* \intf Iterate over the objects in a document, in order of increasing object
number. *)
let objiter f doc =
  let f' k v =
    match v with
    | {contents = Parsed obj}, _ -> f k obj
    | {contents = ToParse}, _ -> f k (parse_lazy doc k)
  in
    pdfobjmap_iter f' doc.objects.pdfobjects

(* \intf Same, but also pass generation number. *)
let objiter_gen f doc =
  let f' k v =
    match v with
    | {contents = Parsed obj}, g -> f k g obj
    | {contents = ToParse}, g -> f k g (parse_lazy doc k)
  in
    pdfobjmap_iter f' doc.objects.pdfobjects

(* \intf Map on objects. *)
let objmap f doc =
  let f' i = function
    | {contents = Parsed obj}, g -> ref (Parsed (f obj)), g
    | {contents = ToParse}, g -> ref (Parsed (parse_lazy doc i)), g
  in
    doc.objects <-
       {doc.objects with
          pdfobjects = pdfobjmap_mapi f' doc.objects.pdfobjects}

let maxobjnum pdf =
  pdf.objects.maxobjnum

(* Return a list of object numbers. *)
let objnumbers pdf =
  let keys = ref [] in
    objiter (fun k _ -> keys =| k) pdf;
    rev !keys

(* \intf Cardinality of object set. O(n). *)
let objcard pdf =
  let card = ref 0 in
    objiter (fun _ _ -> incr card) pdf;
    !card

(* Remove an object. *)
let removeobj doc o =
  {doc with objects =
    {doc.objects with pdfobjects = pdfobjmap_remove o doc.objects.pdfobjects}}

(* Return a list of (k, v) pairs. *)
let list_of_objs doc =
  let objs = ref [] in
    objiter (fun k v -> objs =| (k, Parsed v)) doc;
    !objs

(* \intf Add an object, given an object number. *)
let addobj_given_num doc (num, obj) =
  doc.objects.maxobjnum <- max doc.objects.maxobjnum num;
  doc.objects.pdfobjects <- pdfobjmap_add num (ref (Parsed obj), 0) doc.objects.pdfobjects

(* \intf Add an object. We use the first number larger than the maxobjnum, and update that. *)
let addobj doc obj =
  let num = doc.objects.maxobjnum + 1 in
    addobj_given_num doc (num, obj);
    num

(* Make a objects entry from a list of (number, object) pairs. *)
let objects_of_list parse l =
  let maxobj = ref 0
  and map = ref pdfobjmap_empty in
    iter
      (fun (k, v) ->
         maxobj := max !maxobj k;
         map := pdfobjmap_add k v !map)
      l;
    {parse = parse; pdfobjects = !map; objectsfromstream = []; maxobjnum = !maxobj}

(* Find the page reference numbers, given the top level node of the page tree *)
let rec page_reference_numbers_inner pdf pages_node node_number =
  match lookup_direct pdf "/Type" pages_node with
  | Some (Name "/Pages") | None ->
      begin match lookup_direct pdf "/Kids" pages_node with
      | Some (Array elts) ->
          flatten
            (map
              (function
               | Indirect i ->
                   page_reference_numbers_inner
                     pdf (direct pdf (Indirect i)) i
               | _ -> raise (PDFError "badly formed page tree A"))
              elts)
      | _ -> raise (PDFError "badly formed page tree B")
      end
  | Some (Name "/Page") -> [node_number]
  | _ -> raise (PDFError "badly formed page tree C")

let page_reference_numbers pdf =
  let root = lookup_obj pdf pdf.root in
    let pages_node =
        match lookup_direct pdf "/Pages" root with
        | Some p -> p
        | None -> raise (PDFError "badly formed page tree D")
    in
      page_reference_numbers_inner pdf pages_node ~-1

(* Renumber an object given a change table (A hash table mapping old to new
numbers). *)
let rec renumber_object_parsed (pdf : pdfdoc) changes obj =
  match obj with
  | Indirect i ->
      let i' =
        match tryfind changes i with
        | Some x -> x
        | None -> i (*r A dangling indirect is valid. *)
      in
        Indirect i'
  | Array a ->
      recurse_array (renumber_object_parsed pdf changes) a
  | Dictionary d ->
      recurse_dict (renumber_object_parsed pdf changes) d
  | Stream {contents = (p, s)} ->
      Stream {contents = renumber_object_parsed pdf changes p, s}
  | pdfobject -> pdfobject

let renumber_object pdf changes objnum = function
  | ToParse ->
      renumber_object_parsed pdf changes (parse_lazy pdf objnum)
  | Parsed obj ->
      renumber_object_parsed pdf changes obj

(* Renumber a PDF's objects to [1]\ldots [n]. *)

(* Calculate the substitutions required to renumber the document. *)
let changes pdf =
  let card = objcard pdf in
    let order = ilist_fail_null 1 card
    and change_table = Hashtbl.create card in
      List.iter2 (Hashtbl.add change_table) (objnumbers pdf) order;
      change_table
      
(* Perform all renumberings given by a change table. *)
let renumber change_table pdf =
  let root' =
    match tryfind change_table pdf.root with Some x -> x | None -> pdf.root
  and trailerdict' =
    renumber_object pdf change_table 0 (Parsed pdf.trailerdict)
  and objects' =
    let nums, objs = split (list_of_objs pdf) in
      let objs' =
        map2 (renumber_object pdf change_table) nums objs
      and nums' =
        map (function k -> match tryfind change_table k with Some x -> x | None -> k) nums
      in
        objects_of_list
          pdf.objects.parse
          (combine nums' (map (fun x -> ref (Parsed x), 0) objs'))
  in
    {pdf with
     root = root';
     objects = objects';
     trailerdict = trailerdict'}
 
(* \intf Renumber the objects (including root and trailer dictionary) in a list of
pdfs so they are mutually exclusive. We iterate over the key lists to build
a list of change tables which are applied to the input PDFs. NOTE: This can't
be used on PDFs where the generation numbers still matter (i.e before
decryption). *)
let renumber_pdfs pdfs =
  let keylists = map objnumbers pdfs
  and bse = ref 1
  and tables = ref [] in
    iter
      (fun k ->
         let length = length k in
           let table = Hashtbl.create length in
             List.iter2 (Hashtbl.add table) k (ilist !bse (!bse + length - 1));
             tables =| table;
             bse += length)
      keylists;
    map2 renumber (rev !tables) pdfs

(* Used for sets of object numbers. *)

(*IF-OCAML*)
module RefSet =
  Set.Make
    (struct
       type t = int
       let compare = compare
    end)

let refset_add = RefSet.add
let refset_empty = RefSet.empty
let refset_elements = RefSet.elements
(*ENDIF-OCAML*)

(*i*)(*F#
let RefSet : int Set.Provider = Set.Make compare

let refset_add = RefSet.add
let refset_empty = RefSet.empty
let refset_elements = RefSet.elements
F#*)(*i*)

(* Give a list of object numbers referenced in a given [pdfobject] *)
let rec referenced no_follow_entries no_follow_contains pdf found i = function
  | Parsed (Indirect i) ->
      if not (RefSet.mem i !found) then
        begin
          let obj = 
            try lookup_obj pdf i with
              Not_found -> dpr "2M"; Null
          in
            match obj with
            | Dictionary d ->
                if not (mem true (map (mem' no_follow_contains) d)) then
                  begin
                  found := RefSet.add i !found;
                  referenced no_follow_entries no_follow_contains pdf found i (Parsed obj)
                  end
            | _ ->
              found := RefSet.add i !found;
              referenced no_follow_entries no_follow_contains pdf found i (Parsed obj)
        end
  | Parsed (Array a) ->
      iter
        (referenced no_follow_entries no_follow_contains pdf found i)
        (map (fun x -> Parsed x) a)
  | Parsed (Dictionary d) ->
      iter
        (referenced no_follow_entries no_follow_contains pdf found i)
        (map
          (fun x -> Parsed (snd x))
          (lose (fun (k, _) -> mem k no_follow_entries) d))
  | Parsed (Stream s) ->
      referenced no_follow_entries no_follow_contains pdf found i (Parsed (fst !s))
  | Parsed _ ->
      ()
  | ToParse ->
      referenced no_follow_entries no_follow_contains pdf found i (Parsed (parse_lazy pdf i))

(* Nullify all references to page objects which are no longer in the page tree.
This prevents (for instance) annotations on a page referencing a deleted page,
thus preventing the deleted page's objects from being recovered during garbage
collection. *)
let nullify_deleted_page_references pdf =
  let rec nullify numbers = function
    | Indirect i when mem i numbers -> Null
    | Array elts -> recurse_array (nullify numbers) elts
    | Dictionary elts -> recurse_dict (nullify numbers) elts
    | Stream {contents = (p, s)} -> Stream {contents = nullify numbers p, s}
    | x -> x
  and page_object_numbers =
    let nums = ref [] in
      objiter
        (function objnum ->
           function
             | Dictionary d when lookup "/Type" d  = Some (Name "/Page") ->
                 nums := objnum :: !nums
             | x -> ())
        pdf;
      !nums
  in
    objmap (nullify (setminus page_object_numbers (page_reference_numbers pdf))) pdf

(* \intf Remove any unreferenced objects. *)
let remove_unreferenced pdf =
  nullify_deleted_page_references pdf;
  let found = ref RefSet.empty in
    referenced [] [] pdf found pdf.root (Parsed (lookup_obj pdf pdf.root));
    referenced [] [] pdf found 0 (Parsed pdf.trailerdict);
    found := RefSet.add pdf.root !found;
    let eltnumbers = RefSet.elements !found in
      (* If not found, just ignore. *)
      let elements =
        map
          (fun n -> try lookup_obj pdf n with Not_found -> dpr "2N"; Null)
          eltnumbers
      in
        pdf.objects <-
          {maxobjnum = 0;
           parse = pdf.objects.parse;
           objectsfromstream = pdf.objects.objectsfromstream;
           pdfobjects = pdfobjmap_empty};
        iter (addobj_given_num pdf) (combine eltnumbers elements)

(* \intf Objects referenced from a given one. *)
let objects_referenced no_follow_entries no_follow_contains pdf pdfobject =
  let set = ref RefSet.empty in
    referenced no_follow_entries no_follow_contains pdf set 0 (Parsed pdfobject);
    RefSet.elements !set

(* \intf The same, but return the objects too. *)
let objects_referenced_and_objects no_follow_entries no_follow_contains pdf pdfobject =
  let nums =
    objects_referenced no_follow_entries no_follow_contains pdf pdfobject
  in
    combine nums (map (lookup_obj pdf) nums)

(* \intf Remove a dictionary entry. Also works for streams. *)
let rec remove_dict_entry dict key =
  match dict with
  | Dictionary d -> Dictionary (remove key d)
  | Stream ({contents = (dict', stream)} as s) ->
      s := (remove_dict_entry dict' key, stream);
      Stream s
  | _ -> raise (PDFError "remove_dict_entry: not a dictionary")

(* \intf Replace dict entry, raising [Not_found] if it's not there. Also works
for streams.*)
let rec replace_dict_entry dict key value =
  match dict with
  | Dictionary d -> Dictionary (replace key value d)
  | Stream ({contents = (dict', stream)} as s) ->
      s := (replace_dict_entry dict' key value, stream);
      Stream s
  | _ -> raise (PDFError "replace_dict_entry: not a dictionary.")

(* \intf Add a dict entry, replacing if there. Also works for streams. *)
let rec add_dict_entry dict key value =
  match dict with
  | Dictionary d -> Dictionary (add key value d)
  | Stream ({contents = (dict', stream)} as s) ->
      s := (add_dict_entry dict' key value, stream);
      Stream s
  | _ -> raise (PDFError "add_dict_entry: not a dictionary.")

(* Find the contents of a stream as a bytestream. *)
let rec bigarray_of_stream s =
  getstream s;
  match s with
  | Stream {contents = _, Got bytestream} -> bytestream
  | _ -> raise (PDFError "couldn't extract raw stream")

(* \intf Given a dictionary and a prefix (e.g gs), return a name, starting with the
prefix, which is not already in the dictionary (e.g /gs0). *)
let unique_key prefix obj =
  let elts = match obj with
    | Dictionary es
    | Stream {contents = Dictionary es, _} -> es
    | _ -> raise (PDFError "unique_key: Not a dictionary or stream")
  in
    let names = fst (split elts)
    and name_of_num n = "/" ^ prefix ^ string_of_int n
    and num = ref 0 in
      while mem (name_of_num !num) names do incr num done;
      name_of_num !num

(* \intf Given a PDF and potential filename, calculate an MD5 string and build a
suitable /ID entry from it. *)
let generate_id (pdf : pdfdoc) (path : string) =
  (*IF-OCAML*)
  let gettime () = Unix.gettimeofday () in
  (*ENDIF-OCAML*)
  (*F#
  let gettime () = Sys.time () in
  F#*)
  let d =
    digest (path ^ string_of_float (gettime ()))
  in
    Array [String d; String d]

(* Find all the indirect numbers reachable from an entry in a dictionary,
including the indirect of that dictionary entry, if it's an indirect. *)
let reference_numbers_of_dict_entry pdf dict entry =
  match dict with
  | Dictionary d ->
      begin match lookup entry d with
      | Some x -> objects_referenced [] [] pdf x
      | None ->
          raise (PDFError "reference_numbers_of_dict_entry: no entry")
      end
  | _ ->
      raise (PDFError "reference_numbers_of_dict_entry: not a dictionary")

(* Find the indirect reference given by the value associated with a key in a
dictionary. *)
let find_indirect key dict =
  match dict with
  | Dictionary d ->
      begin match lookup key d with
      | Some (Indirect i) -> Some i
      | _ -> None
      end
  | _ -> raise (PDFError "find_indirect: not a dictionary")

(* Name tree functionality *)

(* Look something up in a name tree. *)
let rec nametree_lookup_kids pdf k = function
  | Array (h::t) ->
      begin match nametree_lookup pdf k h with
      | None ->
          nametree_lookup_kids pdf k (Array t)
      | Some result -> Some result
      end
  | Array [] -> None
  | _ -> raise (PDFError "malformed name tree")

and array_lookup pdf k = function
  | Array elts ->
      lookup k (pairs_of_list elts)
  | _ -> raise (PDFError "Bad lookup array")

and nametree_lookup pdf k dict =
  match lookup_direct pdf "/Limits" dict with
  | Some (Array [l;r]) ->
      if k < l || k > r then None else
      begin match lookup_direct pdf "/Kids" dict with
      | Some kids ->
          (* Intermediate node *)
          nametree_lookup_kids pdf k kids
      | None ->
          match lookup_direct pdf "/Names" dict with
          | Some names ->
              (* leaf node *)
              array_lookup pdf k names
          | None ->
              raise (PDFError "Malformed name tree entry")
      end
  | None ->
      begin match lookup_direct pdf "/Kids" dict with
      | Some kids ->
          (* Root node with kids *)
          nametree_lookup_kids pdf k kids
      | None ->
          match lookup_direct pdf "/Names" dict with
          | Some names ->
              (* Root node with names *)
              array_lookup pdf k names
          | None ->
              raise (PDFError "Missing name tree entry")
      end
  | _ -> raise (PDFError "Malformed name tree")

(* Return an ordered list of all the (k, v) pairs in a tree *)
let rec contents_of_nametree pdf tree =
  match lookup_direct pdf "/Names" tree with
  | Some (Array names) ->
      let rec pairs_of_list prev = function
      | [] -> rev prev
      | [_] -> raise (PDFError "contents_of_nametree: bad /Names")
      | k::v::r -> pairs_of_list ((k, v)::prev) r
      in
        pairs_of_list [] names
  | _ ->
      match lookup_direct pdf "/Kids" tree with
      | Some (Array kids) ->
          flatten (map (contents_of_nametree pdf) kids)
      | _ -> raise (PDFError "contents_of_nametree: neither names nor kids")

let copy_pdf from =
  {major = from.major;
   minor = from.minor;
   root = from.root;
   objects = from.objects;
   trailerdict = from.trailerdict}

let deep_copy from =
  let pdf = copy_pdf from in
    objmap
      (function
       | Stream {contents = (dict, Got stream)} -> Stream (ref (dict, Got (copystream stream)))
       | x -> x)
      pdf;
    pdf