1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432
|
open Pdfutil
(* We allow \n in titles. Split for typesetter. *)
let rec split_toc_title_inner a = function
| '\\'::'n'::r -> rev a :: split_toc_title_inner [] r
| x::xs -> split_toc_title_inner (x::a) xs
| [] -> [rev a]
let split_toc_title = split_toc_title_inner []
(* And for new bookmark for TOC, change \\n to \n *)
let rec real_newline = function
| '\\'::'n'::r -> '\n'::real_newline r
| x::r -> x::real_newline r
| [] -> []
let width_table_cache = null_hash ()
let rec width_of_runs runs =
match runs with
| Cpdftype.Font (id, f, fontsize)::Cpdftype.Text t::more ->
let width_table =
match Hashtbl.find width_table_cache (id, fontsize) with
| w -> w
| exception Not_found ->
let ws = Cpdftype.font_widths id f fontsize in Hashtbl.add width_table_cache (id, fontsize) ws; ws
in
Cpdftype.width_of_string width_table t +. width_of_runs more
| [] -> 0.
| _ -> failwith "width_of_runs"
(* Run of Font / Text elements from a fontpack and UTF8 text *)
let of_utf8 fontpack fontsize t =
let codepoints = Pdftext.codepoints_of_utf8 t in
let fonted = option_map (Cpdfembed.get_char fontpack) codepoints in
let collated = Cpdfembed.collate_runs fonted in
flatten
(map
(function
| [] -> []
| (_, n, font) as h::t ->
let charcodes = map (fun (c, _, _) -> char_of_int c) (h::t) in
[Cpdftype.Font (string_of_int n, font, fontsize); Cpdftype.Text charcodes])
collated)
(* Cpdftype codepoints from a font and PDFDocEndoding string *)
let of_pdfdocencoding fontpack fontsize t =
of_utf8 fontpack fontsize (Pdftext.utf8_of_pdfdocstring t)
(* Remove characters until it is below the length. Then remove three more and
add dots for an ellipsis *)
let rec shorten_text_inner l t =
match rev t with
| Cpdftype.Text text::Cpdftype.Font (id, f, fs)::more ->
let width_table =
match Hashtbl.find width_table_cache (id, fs) with
| w -> w
| exception Not_found ->
let ws = Cpdftype.font_widths id f fs in Hashtbl.add width_table_cache (id, fs) ws; ws
in
if Cpdftype.width_of_string width_table text > l then
shorten_text_inner l (rev (Cpdftype.Text (all_but_last text)::Cpdftype.Font (id, f, fs)::more))
else
t
| _ -> t
let shorten_text fontpack fontsize l t =
let short = shorten_text_inner l t in
if short = t then t else
let charcode, dotfontnum, dotfont =
unopt (Cpdfembed.get_char fontpack (int_of_char '.'))
in
let charcode = char_of_int charcode in
short @ [Cpdftype.Font (string_of_int dotfontnum, dotfont, fontsize); Cpdftype.Text [charcode; charcode; charcode]]
(* Calculate the used codepoints *)
let used pdf fastrefnums labels title marks =
let codepoints = null_hash () in
Hashtbl.add codepoints (int_of_char '.') ();
let addtext t =
iter
(fun c -> Hashtbl.replace codepoints c ())
(Pdftext.codepoints_of_utf8 (Pdftext.utf8_of_pdfdocstring t))
in
iter (fun c -> Hashtbl.replace codepoints c ()) (Pdftext.codepoints_of_utf8 title);
iter
(fun m ->
addtext m.Pdfmarks.text;
let pnum = Pdfpage.pagenumber_of_target ~fastrefnums pdf m.Pdfmarks.target in
let labeltext =
try Pdfpagelabels.pagelabeltext_of_pagenumber pnum labels with Not_found -> string_of_int pnum
in
addtext labeltext)
marks;
codepoints
(* Make a dot leader *)
let make_dots space fontpack fontsize =
let dotruns = of_utf8 fontpack fontsize "." in
let dotwidth = width_of_runs dotruns in
let runs = flatten (many dotruns (int_of_float (floor (space /. dotwidth)))) in
let remainder = space -. width_of_runs runs in
[Cpdftype.HGlue remainder] @ runs
(* Prepend structure tree items. FIXME: What to do if not present? Currently we do nothing. *)
let prepend_structitems pdf items =
match Pdf.lookup_chain pdf pdf.Pdf.trailerdict ["/Root"; "/StructTreeRoot"; "/K"] with
| Some (Pdf.Array a) ->
Pdf.replace_chain pdf ["/Root"; "/StructTreeRoot"; "/K"] (Pdf.Array (items @ a))
| Some (Pdf.Dictionary d) ->
Pdf.replace_chain pdf ["/Root"; "/StructTreeRoot"; "/K"] (Pdf.Array (items @ [Pdf.Dictionary d]))
| _ -> ()
(* FIXME Would be better with a Pdf.remove_chain *)
let remove_parent_tree_next_key pdf =
match Pdf.lookup_obj pdf pdf.Pdf.root with
| Pdf.Dictionary d ->
begin match lookup "/StructTreeRoot" d with
| Some (Pdf.Indirect i) ->
Pdf.addobj_given_num pdf (i, Pdf.remove_dict_entry (Pdf.lookup_obj pdf i) "/ParentTreeNextKey")
| Some (Pdf.Dictionary d2) ->
let newstroot = Pdf.remove_dict_entry (Pdf.Dictionary d2) "/ParentTreeNextKey" in
let newroot = Pdf.add_dict_entry (Pdf.Dictionary d) "/StructTreeRoot" newstroot in
Pdf.addobj_given_num pdf (pdf.Pdf.root, newroot)
| _ -> ()
end
| _ -> ()
(* FIXME Again, replace_chain would be much better here if it could deal with a final indirect. *)
let add_to_parent_tree pdf p =
match Pdf.lookup_chain pdf (Pdf.lookup_obj pdf pdf.Pdf.root) ["/StructTreeRoot"; "/ParentTree"] with
| Some tree ->
let t = Pdftree.read_number_tree pdf tree in
let n = match t with [] -> 0 | l -> int_of_string (fst (last l)) + 1 in
let newtree = Pdftree.build_name_tree true pdf ((string_of_int n, p)::t) in
begin match Pdf.lookup_direct pdf "/StructTreeRoot" (Pdf.lookup_obj pdf pdf.Pdf.root) with
| Some (Pdf.Dictionary d) ->
begin match lookup "/ParentTree" d with
| Some (Pdf.Indirect i) ->
Pdf.addobj_given_num pdf (i, newtree)
| Some (Pdf.Dictionary d) ->
let i = Pdf.addobj pdf newtree in
Pdf.replace_chain pdf ["/Root"; "/StructTreeRoot"; "/ParentTree"] (Pdf.Indirect i)
| _ -> ()
end
| _ -> ()
end;
n
| None -> 0
(* Make sure that there is an existing structure tree suitable for us to merge
into. Check for /StructTreeRoot. If there, nothing to do. Otherwise, build
<</Type/StructTreeRoot/ParentTree .../K[]>>. ParentTree and K actually
optional, but it's easier if we assume they are there. *)
let ensure_minimal_struct_tree pdf =
match Pdf.lookup_chain pdf (Pdf.lookup_obj pdf pdf.Pdf.root) ["/StructTreeRoot"] with
| Some _ -> ()
| None ->
let pt = Pdf.addobj pdf (Pdf.Dictionary [("/Nums", Pdf.Array [])]) in
let str = Pdf.Dictionary [("/Type", Pdf.Name "/StructTreeRoot"); ("/ParentTree", Pdf.Indirect pt); ("/K", Pdf.Array [])] in
Pdf.addobj_given_num pdf (pdf.Pdf.root, (Pdf.add_dict_entry (Pdf.lookup_obj pdf pdf.Pdf.root) "/StructTreeRoot" str))
(* Typeset a table of contents with given font, font size and title. Mediabox
copied from first page of existing PDF cropbox, or mediabox if no crop box.
Margin of 10%. Font size of title twice body font size. Null page labels
added for TOC, others bumped up and so preserved. *)
(* TODO Fix Cpdftype to take a box not a papersize/margins combo. Then we can remove all the CropBox/Mediabox complications here.
Then copying the boxes directly from the first page of the document is ok, and we just prefer the cropbox. Failing file
__PDFUA/decomp/08.pdf *)
let typeset_table_of_contents ~font ~fontsize ~title ~bookmark ~dotleader ~process_struct_tree ?subformat pdf =
let optional l = if process_struct_tree then l else [] in
if process_struct_tree then ensure_minimal_struct_tree pdf;
Hashtbl.clear width_table_cache;
let marks = Pdfmarks.read_bookmarks ~preserve_actions:true pdf in
if marks = [] then (Pdfe.log "No bookmarks, not making table of contents\n"; pdf) else
let labels = Pdfpagelabels.read pdf in
let refnums = Pdf.page_reference_numbers pdf in
let fastrefnums = hashtable_of_dictionary (combine refnums (indx refnums)) in
let codepoints = map fst (list_of_hashtbl (used pdf fastrefnums labels title marks)) in
let fontpack =
match font with
| Cpdfembed.PreMadeFontPack t -> t
| Cpdfembed.EmbedInfo {fontfile; fontname; encoding} ->
Cpdfembed.embed_truetype pdf ~fontfile ~fontname ~codepoints ~encoding
| Cpdfembed.ExistingNamedFont -> raise (Pdf.PDFError "Cannot use existing font with -table-of-contents")
in
let firstpage = hd (Pdfpage.pages_of_pagetree pdf) in
let width, firstpage_papersize, pmaxx, pmaxy, margin =
let width, height, xmax, ymax =
match Pdf.parse_rectangle pdf firstpage.Pdfpage.mediabox with
xmin, ymin, xmax, ymax -> xmax -. xmin, ymax -. ymin, xmax, ymax
in
width, Pdfpaper.make Pdfunits.PdfPoint width height, xmax, ymax, fmin width height *. 0.1
in
let firstpage_cropbox =
match Pdf.lookup_direct pdf "/CropBox" firstpage.Pdfpage.rest with
| Some r -> Some (Pdf.parse_rectangle pdf r)
| None -> None
in
let width =
match firstpage_cropbox with
| Some (xmin, _, xmax, _) -> xmax -. xmin
| None -> width
in
let lines =
map
(fun mark ->
let indent = float mark.Pdfmarks.level *. fontsize *. 2. in
let textruns = of_pdfdocencoding fontpack fontsize mark.Pdfmarks.text in
let labelruns =
if mark.Pdfmarks.target = NullDestination then of_pdfdocencoding fontpack fontsize "" else
let pnum = Pdfpage.pagenumber_of_target ~fastrefnums pdf mark.Pdfmarks.target in
let pde = try Pdfpagelabels.pagelabeltext_of_pagenumber pnum labels with Not_found -> string_of_int pnum in
of_pdfdocencoding fontpack fontsize pde
in
let textgap = width -. margin *. 2. -. indent -. width_of_runs labelruns in
let textruns = shorten_text fontpack fontsize (textgap -. fontsize *. 3.) textruns in
let space = textgap -. width_of_runs textruns in
let leader =
if dotleader && labelruns <> []
then make_dots space fontpack fontsize
else [Cpdftype.HGlue space]
in
[Cpdftype.BeginDest (mark.Pdfmarks.target, Some mark.Pdfmarks.text); Cpdftype.HGlue indent]
@ optional [(Cpdftype.Tag ("Link", 0))] @ textruns @ optional [Cpdftype.EndTag]
@ leader
@ optional [Cpdftype.Tag ("Link", 0)] @ labelruns @ optional [Cpdftype.EndTag]
@ [Cpdftype.EndDest; Cpdftype.NewLine])
(Pdfmarks.read_bookmarks ~preserve_actions:false pdf)
in
let toc_pages, toc_tags =
let title =
let glue = Cpdftype.VGlue (fontsize *. 2.) in
optional [Cpdftype.Tag ("P", 0)]
@ flatten
(map
(fun l -> l @ [Cpdftype.NewLine])
(map (of_utf8 fontpack (fontsize *. 2.)) (map implode (split_toc_title (explode title)))))
@
optional [Cpdftype.EndTag] @ [glue]
in
let lm, rm, bm, tm =
match firstpage_cropbox with
| None -> (margin, margin, margin, margin)
| Some (cminx, cminy, cmaxx, cmaxy) ->
(cminx +. margin, (pmaxx -. cmaxx) +. margin, cminy +. margin, (pmaxy -. cmaxy) +. margin)
in
let firstfont =
hd (keep (function Cpdftype.Font _ -> true | _ -> false) (title @ flatten lines))
in
Cpdftype.typeset ~process_struct_tree lm rm tm bm firstpage_papersize pdf
([firstfont; Cpdftype.BeginDocument] @ title @ flatten lines)
in
let toc_pages =
match firstpage_cropbox with
| Some (a, b, c, d) ->
let rect =
Pdf.Array [Pdf.Real a; Pdf.Real b; Pdf.Real c; Pdf.Real d]
in
map
(fun p -> {p with Pdfpage.rest = Pdf.add_dict_entry p.Pdfpage.rest "/CropBox" rect})
toc_pages
| None -> toc_pages
in
let original_pages = Pdfpage.pages_of_pagetree pdf in
let toc_pages_len = length toc_pages in
let changes = map (fun n -> (n, n + toc_pages_len)) (indx original_pages) in
let pdf = Pdfpage.change_pages ~changes true pdf (toc_pages @ original_pages) in
let toc_pageobjnums = take (Pdf.page_reference_numbers pdf) toc_pages_len in
let label =
{Pdfpagelabels.labelstyle = NoLabelPrefixOnly;
Pdfpagelabels.labelprefix = None;
Pdfpagelabels.startpage = 1;
Pdfpagelabels.startvalue = 1}
in
(* Get indirect of top-level /Document *)
let top_level_document =
match subformat with Some Cpdfua.PDFUA2 ->
begin match Pdf.lookup_chain pdf pdf.Pdf.trailerdict ["/Root"; "/StructTreeRoot"] with
| Some d ->
begin match Pdf.lookup_immediate "/K" d with
| Some (Pdf.Indirect i) -> i
| Some (Pdf.Array [Pdf.Indirect i]) -> i
| _ -> 0
end
| _ -> 0
end
| _ -> 0
in
let p_struct_elem_first_page_ref = ref 0 in
if process_struct_tree then
begin
let struct_tree_root =
if top_level_document > 0 then top_level_document else
match Pdf.lookup_immediate "/StructTreeRoot" (Pdf.lookup_obj pdf pdf.Pdf.root) with
| Some (Pdf.Indirect i) -> i
| _ -> 0 (* Will never happen, because we ran ensure_minimal_struct_tree *)
in
let p_struct_elem_first_page =
Pdf.addobj pdf
(Pdf.Dictionary [("/S", Pdf.Name "/P");
("/Pg", Pdf.Indirect (hd toc_pageobjnums));
("/K", Pdf.Array [Pdf.Integer 0]);
("/P", Pdf.Indirect struct_tree_root)])
in
p_struct_elem_first_page_ref := p_struct_elem_first_page;
let mcid = ref 1 in
let link_struct_elems_for_each_page =
map2
(fun page pageobjnum ->
let annot_objnums =
match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with
| Some (Pdf.Array a) -> map (function Pdf.Indirect i -> i | _ -> 0) a
| _ -> []
in
let r = map
(fun annot_i ->
let r =
let objr = Pdf.addobj pdf (Pdf.Dictionary [("/Type", Pdf.Name "/OBJR"); ("/Obj", Pdf.Indirect annot_i)]) in
Pdf.addobj pdf
(Pdf.Dictionary [("/S", Pdf.Name "/Link");
("/K", Pdf.Array [Pdf.Integer !mcid; Pdf.Indirect objr]);
("/P", Pdf.Indirect struct_tree_root);
("/Pg", Pdf.Indirect pageobjnum)])
in
incr mcid; r)
annot_objnums
in
mcid := 0; r)
toc_pages
toc_pageobjnums
in
let prepending_structitems =
map (fun x -> Pdf.Indirect x) (p_struct_elem_first_page::flatten link_struct_elems_for_each_page)
in
(* Add the key and value structure item (any p, and that page's links) to the parent tree for each TOC page *)
let toc_structure_items_per_page =
match link_struct_elems_for_each_page with
| h::t -> (p_struct_elem_first_page::h)::t
| [] -> []
in
iter2
(fun o ns ->
let page = Pdf.lookup_obj pdf o in
(* For each annotation, add a structparent entry too. *)
let annot_objnums =
match Pdf.lookup_direct pdf "/Annots" page with
| Some (Pdf.Array a) -> map (function Pdf.Indirect i -> i | _ -> 0) a
| _ -> []
in
(* Remove the Title P from first page list *)
let ns2 = if length ns > length annot_objnums then tl ns else ns in
iter3
(fun annot_objnum n mark ->
let annot = Pdf.lookup_obj pdf annot_objnum in
let sp_num = add_to_parent_tree pdf (Pdf.Indirect n) in
let new_annot = Pdf.add_dict_entry annot "/StructParent" (Pdf.Integer sp_num) in
let a =
match mark.Pdfmarks.target with
| Pdfdest.Action a -> a
| _ -> Pdf.Null
in
let new_annot =
if subformat = Some Cpdfua.PDFUA2 then Pdf.add_dict_entry new_annot "/A" a else new_annot
in
Pdf.addobj_given_num pdf (annot_objnum, new_annot))
annot_objnums
ns2
(flatten (many marks 2));
let ptn = add_to_parent_tree pdf (Pdf.Array (map (fun x -> Pdf.Indirect x) ns)) in
Pdf.addobj_given_num pdf (o, Pdf.add_dict_entry page "/StructParents" (Pdf.Integer ptn)))
toc_pageobjnums
toc_structure_items_per_page;
remove_parent_tree_next_key pdf;
if subformat = Some Cpdfua.PDFUA2 then
(* Assume that it is just a single, indirect, top-level document.
Either given as an indirect, or an array of one indirect. This
assumption is ok because /P entries must have an indirect to point
to. So if the document contains anything, the /Document structelem
must be indirect. *)
begin match Pdf.lookup_chain pdf pdf.Pdf.trailerdict ["/Root"; "/StructTreeRoot"] with
| Some d ->
if top_level_document = 0 then () else
let obj = Pdf.lookup_obj pdf top_level_document in
let obj' =
let k' =
match Pdf.lookup_direct pdf "/K" obj with
| Some (Pdf.Array a) -> Pdf.Array (prepending_structitems @ a)
| Some (Pdf.Dictionary d) -> Pdf.Array (prepending_structitems @ [Pdf.Dictionary d])
| _ -> Pdf.Null
in
Pdf.add_dict_entry obj "/K" k'
in
Pdf.addobj_given_num pdf (top_level_document, obj')
| _ -> ()
end
else
begin match Pdf.lookup_chain pdf pdf.Pdf.trailerdict ["/Root"; "/StructTreeRoot"; "/K"] with
| Some (Pdf.Array a) ->
Pdf.replace_chain pdf ["/Root"; "/StructTreeRoot"; "/K"] (Pdf.Array (prepending_structitems @ a))
| Some (Pdf.Dictionary d) ->
Pdf.replace_chain pdf ["/Root"; "/StructTreeRoot"; "/K"] (Pdf.Array (prepending_structitems @ [Pdf.Dictionary d]))
| _ ->
()
end
end;
let labels' = label::map (fun l -> {l with Pdfpagelabels.startpage = l.Pdfpagelabels.startpage + toc_pages_len}) labels in
Pdfpagelabels.write pdf labels';
if bookmark then
let marks = Pdfmarks.read_bookmarks ~preserve_actions:true pdf in
let refnums = Pdf.page_reference_numbers pdf in
let newmark =
{Pdfmarks.level = 0;
Pdfmarks.text = Pdftext.pdfdocstring_of_utf8 (implode (real_newline (explode title)));
Pdfmarks.target =
if subformat = Some Cpdfua.PDFUA2 then
let action =
Pdf.Dictionary
[("/SD", Pdf.Array [Pdf.Indirect !p_struct_elem_first_page_ref; Pdf.Name "/XYZ"; Pdf.Null; Pdf.Null; Pdf.Null]);
("/S", Pdf.Name "/GoTo");
("/D", Pdf.Array [Pdf.Indirect (hd refnums); Pdf.Name "/XYZ"; Pdf.Null; Pdf.Null; Pdf.Null])]
in
Pdfdest.Action action
else
Pdfdest.XYZ (Pdfdest.PageObject (hd refnums), None, None, None);
Pdfmarks.isopen = false;
Pdfmarks.colour = (0., 0., 0.);
Pdfmarks.flags = 0}
in
Pdfmarks.add_bookmarks (newmark::marks) pdf
else
pdf
|