File: cpdftoc.ml

package info (click to toggle)
cpdf 2.8.1-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 6,828 kB
  • sloc: ml: 34,724; makefile: 65; sh: 45
file content (432 lines) | stat: -rw-r--r-- 19,453 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
open Pdfutil

(* We allow \n in titles. Split for typesetter. *)
let rec split_toc_title_inner a = function
  | '\\'::'n'::r -> rev a :: split_toc_title_inner [] r
  | x::xs -> split_toc_title_inner (x::a) xs
  | [] -> [rev a]

let split_toc_title = split_toc_title_inner []

(* And for new bookmark for TOC, change \\n to \n *)
let rec real_newline = function
  | '\\'::'n'::r -> '\n'::real_newline r
  | x::r -> x::real_newline r
  | [] -> []

let width_table_cache = null_hash ()

let rec width_of_runs runs =
  match runs with
  | Cpdftype.Font (id, f, fontsize)::Cpdftype.Text t::more ->
      let width_table =
        match Hashtbl.find width_table_cache (id, fontsize) with
        | w -> w
        | exception Not_found ->
            let ws = Cpdftype.font_widths id f fontsize in Hashtbl.add width_table_cache (id, fontsize) ws; ws
      in
        Cpdftype.width_of_string width_table t +. width_of_runs more
  | [] -> 0.
  | _ -> failwith "width_of_runs"

(* Run of Font / Text elements from a fontpack and UTF8 text *)
let of_utf8 fontpack fontsize t =
  let codepoints = Pdftext.codepoints_of_utf8 t in
  let fonted = option_map (Cpdfembed.get_char fontpack) codepoints in
  let collated = Cpdfembed.collate_runs fonted in
    flatten
      (map
        (function
         | [] -> []
         | (_, n, font) as h::t ->
             let charcodes = map (fun (c, _, _) -> char_of_int c) (h::t) in
               [Cpdftype.Font (string_of_int n, font, fontsize); Cpdftype.Text charcodes])
        collated)

(* Cpdftype codepoints from a font and PDFDocEndoding string *)
let of_pdfdocencoding fontpack fontsize t =
  of_utf8 fontpack fontsize (Pdftext.utf8_of_pdfdocstring t)

(* Remove characters until it is below the length. Then remove three more and
   add dots for an ellipsis *)
let rec shorten_text_inner l t =
  match rev t with
  | Cpdftype.Text text::Cpdftype.Font (id, f, fs)::more ->
      let width_table =
        match Hashtbl.find width_table_cache (id, fs) with
        | w -> w
        | exception Not_found ->
            let ws = Cpdftype.font_widths id f fs in Hashtbl.add width_table_cache (id, fs) ws; ws
      in
      if Cpdftype.width_of_string width_table text > l then
        shorten_text_inner l (rev (Cpdftype.Text (all_but_last text)::Cpdftype.Font (id, f, fs)::more))
      else
        t
  | _ -> t

let shorten_text fontpack fontsize l t =
  let short = shorten_text_inner l t in
    if short = t then t else
      let charcode, dotfontnum, dotfont =
        unopt (Cpdfembed.get_char fontpack (int_of_char '.'))
      in
      let charcode = char_of_int charcode in
        short @ [Cpdftype.Font (string_of_int dotfontnum, dotfont, fontsize); Cpdftype.Text [charcode; charcode; charcode]]

(* Calculate the used codepoints *)
let used pdf fastrefnums labels title marks =
  let codepoints = null_hash () in
  Hashtbl.add codepoints (int_of_char '.') ();
  let addtext t =
    iter
      (fun c -> Hashtbl.replace codepoints c ())
      (Pdftext.codepoints_of_utf8 (Pdftext.utf8_of_pdfdocstring t))
  in
    iter (fun c -> Hashtbl.replace codepoints c ()) (Pdftext.codepoints_of_utf8 title);
    iter
      (fun m ->
        addtext m.Pdfmarks.text;
        let pnum = Pdfpage.pagenumber_of_target ~fastrefnums pdf m.Pdfmarks.target in
        let labeltext =
          try Pdfpagelabels.pagelabeltext_of_pagenumber pnum labels with Not_found -> string_of_int pnum
        in
          addtext labeltext)
      marks;
    codepoints

(* Make a dot leader *)
let make_dots space fontpack fontsize =
  let dotruns = of_utf8 fontpack fontsize "." in
  let dotwidth = width_of_runs dotruns in
  let runs = flatten (many dotruns (int_of_float (floor (space /. dotwidth)))) in
  let remainder = space -. width_of_runs runs in
    [Cpdftype.HGlue remainder] @ runs

(* Prepend structure tree items. FIXME: What to do if not present? Currently we do nothing. *)
let prepend_structitems pdf items =
  match Pdf.lookup_chain pdf pdf.Pdf.trailerdict ["/Root"; "/StructTreeRoot"; "/K"] with
  | Some (Pdf.Array a) ->
      Pdf.replace_chain pdf ["/Root"; "/StructTreeRoot"; "/K"] (Pdf.Array (items @ a))
  | Some (Pdf.Dictionary d) ->
      Pdf.replace_chain pdf ["/Root"; "/StructTreeRoot"; "/K"] (Pdf.Array (items @ [Pdf.Dictionary d]))
  | _ -> ()

(* FIXME Would be better with a Pdf.remove_chain *)
let remove_parent_tree_next_key pdf =
  match Pdf.lookup_obj pdf pdf.Pdf.root with
  | Pdf.Dictionary d ->
      begin match lookup "/StructTreeRoot" d with
      | Some (Pdf.Indirect i) ->
          Pdf.addobj_given_num pdf (i, Pdf.remove_dict_entry (Pdf.lookup_obj pdf i) "/ParentTreeNextKey")
      | Some (Pdf.Dictionary d2) ->
          let newstroot = Pdf.remove_dict_entry (Pdf.Dictionary d2) "/ParentTreeNextKey" in
          let newroot = Pdf.add_dict_entry (Pdf.Dictionary d) "/StructTreeRoot" newstroot in
            Pdf.addobj_given_num pdf (pdf.Pdf.root, newroot)
      | _ -> ()
      end
  | _ -> ()

(* FIXME Again, replace_chain would be much better here if it could deal with a final indirect. *)
let add_to_parent_tree pdf p =
  match Pdf.lookup_chain pdf (Pdf.lookup_obj pdf pdf.Pdf.root) ["/StructTreeRoot"; "/ParentTree"] with
  | Some tree ->
      let t = Pdftree.read_number_tree pdf tree in
      let n = match t with [] -> 0 | l -> int_of_string (fst (last l)) + 1 in
      let newtree = Pdftree.build_name_tree true pdf ((string_of_int n, p)::t) in
        begin match Pdf.lookup_direct pdf "/StructTreeRoot" (Pdf.lookup_obj pdf pdf.Pdf.root) with
        | Some (Pdf.Dictionary d) ->
            begin match lookup "/ParentTree" d with
            | Some (Pdf.Indirect i) ->
                Pdf.addobj_given_num pdf (i, newtree)
            | Some (Pdf.Dictionary d) ->
                let i = Pdf.addobj pdf newtree in
                  Pdf.replace_chain pdf ["/Root"; "/StructTreeRoot"; "/ParentTree"] (Pdf.Indirect i)
            | _ -> ()
            end
        | _ -> ()
        end;
        n
  | None -> 0

(* Make sure that there is an existing structure tree suitable for us to merge
   into. Check for /StructTreeRoot. If there, nothing to do. Otherwise, build
   <</Type/StructTreeRoot/ParentTree .../K[]>>. ParentTree and K actually
   optional, but it's easier if we assume they are there. *)
let ensure_minimal_struct_tree pdf =
  match Pdf.lookup_chain pdf (Pdf.lookup_obj pdf pdf.Pdf.root) ["/StructTreeRoot"] with
  | Some _ -> ()
  | None ->
      let pt = Pdf.addobj pdf (Pdf.Dictionary [("/Nums", Pdf.Array [])]) in
      let str = Pdf.Dictionary [("/Type", Pdf.Name "/StructTreeRoot"); ("/ParentTree", Pdf.Indirect pt); ("/K", Pdf.Array [])] in
        Pdf.addobj_given_num pdf (pdf.Pdf.root, (Pdf.add_dict_entry (Pdf.lookup_obj pdf pdf.Pdf.root) "/StructTreeRoot" str))

(* Typeset a table of contents with given font, font size and title. Mediabox
   copied from first page of existing PDF cropbox, or mediabox if no crop box.
   Margin of 10%. Font size of title twice body font size. Null page labels
   added for TOC, others bumped up and so preserved. *)
(* TODO Fix Cpdftype to take a box not a papersize/margins combo. Then we can remove all the CropBox/Mediabox complications here.
   Then copying the boxes directly from the first page of the document is ok, and we just prefer the cropbox. Failing file
   __PDFUA/decomp/08.pdf *)
let typeset_table_of_contents ~font ~fontsize ~title ~bookmark ~dotleader ~process_struct_tree ?subformat pdf =
  let optional l = if process_struct_tree then l else [] in
  if process_struct_tree then ensure_minimal_struct_tree pdf;
  Hashtbl.clear width_table_cache;
  let marks = Pdfmarks.read_bookmarks ~preserve_actions:true pdf in
  if marks = [] then (Pdfe.log "No bookmarks, not making table of contents\n"; pdf) else
  let labels = Pdfpagelabels.read pdf in
  let refnums = Pdf.page_reference_numbers pdf in
  let fastrefnums = hashtable_of_dictionary (combine refnums (indx refnums)) in
  let codepoints = map fst (list_of_hashtbl (used pdf fastrefnums labels title marks)) in
  let fontpack =
    match font with
    | Cpdfembed.PreMadeFontPack t -> t
    | Cpdfembed.EmbedInfo {fontfile; fontname; encoding} ->
       Cpdfembed.embed_truetype pdf ~fontfile ~fontname ~codepoints ~encoding
    | Cpdfembed.ExistingNamedFont -> raise (Pdf.PDFError "Cannot use existing font with -table-of-contents")
  in
  let firstpage = hd (Pdfpage.pages_of_pagetree pdf) in
  let width, firstpage_papersize, pmaxx, pmaxy, margin =
    let width, height, xmax, ymax =
      match Pdf.parse_rectangle pdf firstpage.Pdfpage.mediabox with
        xmin, ymin, xmax, ymax -> xmax -. xmin, ymax -. ymin, xmax, ymax
    in
      width, Pdfpaper.make Pdfunits.PdfPoint width height, xmax, ymax, fmin width height *. 0.1
  in
  let firstpage_cropbox =
    match Pdf.lookup_direct pdf "/CropBox" firstpage.Pdfpage.rest with
    | Some r -> Some (Pdf.parse_rectangle pdf r)
    | None -> None
  in
  let width =
    match firstpage_cropbox with
    | Some (xmin, _, xmax, _) -> xmax -. xmin
    | None -> width 
  in
  let lines =
    map
      (fun mark ->
         let indent = float mark.Pdfmarks.level *. fontsize *. 2. in 
         let textruns = of_pdfdocencoding fontpack fontsize mark.Pdfmarks.text in
         let labelruns =
           if mark.Pdfmarks.target = NullDestination then of_pdfdocencoding fontpack fontsize "" else 
           let pnum = Pdfpage.pagenumber_of_target ~fastrefnums pdf mark.Pdfmarks.target in
           let pde = try Pdfpagelabels.pagelabeltext_of_pagenumber pnum labels with Not_found -> string_of_int pnum in
             of_pdfdocencoding fontpack fontsize pde
         in
         let textgap = width -. margin *. 2. -. indent -. width_of_runs labelruns in
         let textruns = shorten_text fontpack fontsize (textgap -. fontsize *. 3.) textruns in
         let space = textgap -. width_of_runs textruns in
         let leader =
           if dotleader && labelruns <> []
             then make_dots space fontpack fontsize
             else [Cpdftype.HGlue space]
         in
             [Cpdftype.BeginDest (mark.Pdfmarks.target, Some mark.Pdfmarks.text); Cpdftype.HGlue indent]
           @ optional [(Cpdftype.Tag ("Link", 0))] @ textruns @  optional [Cpdftype.EndTag]
           @ leader
           @ optional [Cpdftype.Tag ("Link", 0)] @ labelruns @ optional [Cpdftype.EndTag]
           @ [Cpdftype.EndDest; Cpdftype.NewLine])
      (Pdfmarks.read_bookmarks ~preserve_actions:false pdf)
  in
  let toc_pages, toc_tags =
    let title =
      let glue = Cpdftype.VGlue (fontsize *. 2.) in
        optional [Cpdftype.Tag ("P", 0)]
        @ flatten
           (map
             (fun l -> l @ [Cpdftype.NewLine])
             (map (of_utf8 fontpack (fontsize *. 2.)) (map implode (split_toc_title (explode title)))))
        @
        optional [Cpdftype.EndTag] @ [glue]
    in
    let lm, rm, bm, tm =
      match firstpage_cropbox with
      | None -> (margin, margin, margin, margin)
      | Some (cminx, cminy, cmaxx, cmaxy) ->
          (cminx +. margin, (pmaxx -. cmaxx) +. margin, cminy +. margin, (pmaxy -. cmaxy) +. margin)
    in
      let firstfont =
        hd (keep (function Cpdftype.Font _ -> true | _ -> false) (title @ flatten lines))
      in
        Cpdftype.typeset ~process_struct_tree lm rm tm bm firstpage_papersize pdf
          ([firstfont; Cpdftype.BeginDocument] @ title @ flatten lines)
  in
  let toc_pages =
    match firstpage_cropbox with
    | Some (a, b, c, d) ->
        let rect =
          Pdf.Array [Pdf.Real a; Pdf.Real b; Pdf.Real c; Pdf.Real d]
        in
          map
            (fun p -> {p with Pdfpage.rest = Pdf.add_dict_entry p.Pdfpage.rest "/CropBox" rect})
            toc_pages
    | None -> toc_pages
  in
  let original_pages = Pdfpage.pages_of_pagetree pdf in
  let toc_pages_len = length toc_pages in
  let changes = map (fun n -> (n, n + toc_pages_len)) (indx original_pages) in
  let pdf = Pdfpage.change_pages ~changes true pdf (toc_pages @ original_pages) in
  let toc_pageobjnums = take (Pdf.page_reference_numbers pdf) toc_pages_len in
  let label =
    {Pdfpagelabels.labelstyle = NoLabelPrefixOnly;
     Pdfpagelabels.labelprefix = None;
     Pdfpagelabels.startpage = 1;
     Pdfpagelabels.startvalue = 1}
  in
  (* Get indirect of top-level /Document *)
  let top_level_document =
    match subformat with Some Cpdfua.PDFUA2 ->
      begin match Pdf.lookup_chain pdf pdf.Pdf.trailerdict ["/Root"; "/StructTreeRoot"] with
      | Some d ->
          begin match Pdf.lookup_immediate "/K" d with
          | Some (Pdf.Indirect i) -> i
          | Some (Pdf.Array [Pdf.Indirect i]) -> i
          | _ -> 0
          end
      | _ -> 0
      end
    | _ -> 0
  in
  let p_struct_elem_first_page_ref = ref 0 in
  if process_struct_tree then
    begin
      let struct_tree_root =
        if top_level_document > 0 then top_level_document else
          match Pdf.lookup_immediate "/StructTreeRoot" (Pdf.lookup_obj pdf pdf.Pdf.root) with
          | Some (Pdf.Indirect i) -> i
          | _ -> 0 (* Will never happen, because we ran ensure_minimal_struct_tree *)
      in
      let p_struct_elem_first_page =
        Pdf.addobj pdf
          (Pdf.Dictionary [("/S", Pdf.Name "/P");
                           ("/Pg", Pdf.Indirect (hd toc_pageobjnums));
                           ("/K", Pdf.Array [Pdf.Integer 0]);
                           ("/P", Pdf.Indirect struct_tree_root)])
      in
      p_struct_elem_first_page_ref := p_struct_elem_first_page;
      let mcid = ref 1 in
      let link_struct_elems_for_each_page =
        map2
          (fun page pageobjnum ->
            let annot_objnums =
              match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with
              | Some (Pdf.Array a) -> map (function Pdf.Indirect i -> i | _ -> 0) a
              | _ -> []
            in
            let r = map
                (fun annot_i ->
                   let r =
                     let objr = Pdf.addobj pdf (Pdf.Dictionary [("/Type", Pdf.Name "/OBJR"); ("/Obj", Pdf.Indirect annot_i)]) in
                       Pdf.addobj pdf
                         (Pdf.Dictionary [("/S", Pdf.Name "/Link");
                                          ("/K", Pdf.Array [Pdf.Integer !mcid; Pdf.Indirect objr]);
                                          ("/P", Pdf.Indirect struct_tree_root);
                                          ("/Pg", Pdf.Indirect pageobjnum)])
                  in
                    incr mcid; r)
                annot_objnums
            in
              mcid := 0; r)
          toc_pages
          toc_pageobjnums
      in
      let prepending_structitems =
        map (fun x -> Pdf.Indirect x) (p_struct_elem_first_page::flatten link_struct_elems_for_each_page)
      in
      (* Add the key and value structure item (any p, and that page's links) to the parent tree for each TOC page *)
      let toc_structure_items_per_page =
        match link_struct_elems_for_each_page with
        | h::t -> (p_struct_elem_first_page::h)::t
        | [] -> []
      in
        iter2
          (fun o ns ->
             let page = Pdf.lookup_obj pdf o in
             (* For each annotation, add a structparent entry too. *)
             let annot_objnums =
               match Pdf.lookup_direct pdf "/Annots" page with
               | Some (Pdf.Array a) -> map (function Pdf.Indirect i -> i | _ -> 0) a
               | _ -> []
             in
             (* Remove the Title P from first page list *)
             let ns2 = if length ns > length annot_objnums then tl ns else ns in
             iter3
               (fun annot_objnum n mark ->
                  let annot = Pdf.lookup_obj pdf annot_objnum in
                  let sp_num = add_to_parent_tree pdf (Pdf.Indirect n) in
                  let new_annot = Pdf.add_dict_entry annot "/StructParent" (Pdf.Integer sp_num) in
                  let a =
                    match mark.Pdfmarks.target with
                    | Pdfdest.Action a -> a
                    | _ -> Pdf.Null
                  in
                  let new_annot =
                    if subformat = Some Cpdfua.PDFUA2 then Pdf.add_dict_entry new_annot "/A" a else new_annot
                  in
                    Pdf.addobj_given_num pdf (annot_objnum, new_annot))
               annot_objnums
               ns2
               (flatten (many marks 2));
             let ptn = add_to_parent_tree pdf (Pdf.Array (map (fun x -> Pdf.Indirect x) ns)) in
             Pdf.addobj_given_num pdf (o, Pdf.add_dict_entry page "/StructParents" (Pdf.Integer ptn)))
          toc_pageobjnums
          toc_structure_items_per_page;
      remove_parent_tree_next_key pdf;
      if subformat = Some Cpdfua.PDFUA2 then
        (* Assume that it is just a single, indirect, top-level document.
           Either given as an indirect, or an array of one indirect. This
           assumption is ok because /P entries must have an indirect to point
           to. So if the document contains anything, the /Document structelem
           must be indirect. *)
        begin match Pdf.lookup_chain pdf pdf.Pdf.trailerdict ["/Root"; "/StructTreeRoot"] with
        | Some d ->
            if top_level_document = 0 then () else
              let obj = Pdf.lookup_obj pdf top_level_document in
              let obj' =
                let k' =
                  match Pdf.lookup_direct pdf "/K" obj with
                  | Some (Pdf.Array a) -> Pdf.Array (prepending_structitems @ a)
                  | Some (Pdf.Dictionary d) -> Pdf.Array (prepending_structitems @ [Pdf.Dictionary d])
                  | _ -> Pdf.Null
                in
                  Pdf.add_dict_entry obj "/K" k'
              in
                Pdf.addobj_given_num pdf (top_level_document, obj')
        | _ -> ()
        end
      else
        begin match Pdf.lookup_chain pdf pdf.Pdf.trailerdict ["/Root"; "/StructTreeRoot"; "/K"] with
        | Some (Pdf.Array a) ->
            Pdf.replace_chain pdf ["/Root"; "/StructTreeRoot"; "/K"] (Pdf.Array (prepending_structitems @ a))
        | Some (Pdf.Dictionary d) ->
            Pdf.replace_chain pdf ["/Root"; "/StructTreeRoot"; "/K"] (Pdf.Array (prepending_structitems @ [Pdf.Dictionary d]))
        | _ ->
            ()
        end
    end;
  let labels' = label::map (fun l -> {l with Pdfpagelabels.startpage = l.Pdfpagelabels.startpage + toc_pages_len}) labels in
    Pdfpagelabels.write pdf labels';
    if bookmark then
      let marks = Pdfmarks.read_bookmarks ~preserve_actions:true pdf in
      let refnums = Pdf.page_reference_numbers pdf in
      let newmark =
        {Pdfmarks.level = 0;
         Pdfmarks.text = Pdftext.pdfdocstring_of_utf8 (implode (real_newline (explode title)));
         Pdfmarks.target =
           if subformat = Some Cpdfua.PDFUA2 then
             let action =
               Pdf.Dictionary
                 [("/SD", Pdf.Array [Pdf.Indirect !p_struct_elem_first_page_ref; Pdf.Name "/XYZ";  Pdf.Null; Pdf.Null; Pdf.Null]);
                  ("/S", Pdf.Name "/GoTo");
                  ("/D", Pdf.Array [Pdf.Indirect (hd refnums); Pdf.Name "/XYZ"; Pdf.Null; Pdf.Null; Pdf.Null])]
             in
               Pdfdest.Action action
           else
             Pdfdest.XYZ (Pdfdest.PageObject (hd refnums), None, None, None);
         Pdfmarks.isopen = false;
         Pdfmarks.colour = (0., 0., 0.);
         Pdfmarks.flags = 0}
      in
        Pdfmarks.add_bookmarks (newmark::marks) pdf
    else
      pdf