1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264
|
open Pdfutil
(* Raised when syntax is ok, but endpage is too low. Caught by validator.
Caught and reraised as normal failure by parse_pagespec. *)
exception PageSpecUnknownPage of int
(* There would be no pages *)
exception PageSpecWouldBeNoPages
(* Raised when syntax is wrong. Caught and reraised by parse_pagespec and
validator. *)
exception PageSpecBadSyntax
(* Parsing range specifications *)
let rec splitat_commas toks =
match cleavewhile (neq (Pdfgenlex.LexName ",")) toks with
| [], [] -> []
| [], _ -> raise PageSpecBadSyntax
| some, [] -> [some]
| _::_ as before, _::rest -> before::splitat_commas rest
let is_dimension pdf comparison {Pdfpage.mediabox = box} =
let minx, miny, maxx, maxy = Pdf.parse_rectangle pdf box in
comparison (maxx -. minx) (maxy -. miny)
let select_dimensions comparison pdf candidates =
let pages = Pdfpage.pages_of_pagetree pdf in
let pagenums, kept_pages =
split
(option_map
(fun (index, page) ->
if mem index candidates then Some (index, page) else None)
(combine (indx pages) pages))
in
option_map2
(fun pagenum page ->
if is_dimension pdf comparison page then Some pagenum else None)
pagenums
kept_pages
let select_portrait = select_dimensions ( < )
let select_landscape = select_dimensions ( > )
let rec mk_numbers pdf endpage lexemes =
match lexemes with
| [Pdfgenlex.LexInt n] -> [n]
| [Pdfgenlex.LexName "end"] -> [endpage]
| [Pdfgenlex.LexInt n; Pdfgenlex.LexName "-"; Pdfgenlex.LexInt n'] ->
if n > n' then rev (ilist n' n) else ilist n n'
| [Pdfgenlex.LexName "end"; Pdfgenlex.LexName "-"; Pdfgenlex.LexInt n] ->
if n <= endpage
then rev (ilist n endpage)
else raise (PageSpecUnknownPage n)
| [Pdfgenlex.LexInt n; Pdfgenlex.LexName "-"; Pdfgenlex.LexName "end"] ->
if n <= endpage
then ilist n endpage
else raise (PageSpecUnknownPage n)
| [Pdfgenlex.LexName "end"; Pdfgenlex.LexName "-"; Pdfgenlex.LexName "end"] ->
[endpage]
| [Pdfgenlex.LexName "even"] ->
drop_odds (ilist 1 endpage)
| [Pdfgenlex.LexName "portrait"] ->
select_portrait pdf (ilist 1 endpage)
| [Pdfgenlex.LexName "landscape"] ->
select_landscape pdf (ilist 1 endpage)
| [Pdfgenlex.LexName "odd"] ->
really_drop_evens (ilist 1 endpage)
| [Pdfgenlex.LexName "all"] ->
ilist 1 endpage
| [Pdfgenlex.LexName "reverse"] ->
rev (ilist 1 endpage)
| toks ->
let ranges = splitat_commas toks in
if ranges = [toks] then raise PageSpecBadSyntax else
flatten (map (mk_numbers pdf endpage) ranges)
(* Space dashes and commas *)
let rec add_spaces = function
| [] -> []
| ('-' | ',') as h::t -> ' '::h::' '::add_spaces t
| h::t -> h::add_spaces t
let space_string s =
implode (add_spaces (explode s))
let fixup_negatives endpage = function
| Pdfgenlex.LexName s when String.length s > 1 && s.[0] = '~' ->
Pdfgenlex.LexInt (endpage + 1 + ~-(int_of_string (implode (tl (explode s)))))
| x -> x
let invert_range endpage r =
option_map (fun p -> if mem p r then None else Some p) (ilist 1 endpage)
let duplicate_range n r =
flatten (map (fun x -> many x n) r)
(* e.g [1] -> 1, [iii] -> x, [/]] -> ] etc. *)
let resolve_pagelabels pdf spec =
let labels =
let labs = Pdfpagelabels.read pdf in
map
(fun pnum -> (begin try Pdfpagelabels.pagelabeltext_of_pagenumber pnum labs with Not_found -> "" end, pnum))
(ilist 1 (Pdfpage.endpage pdf))
in
(*iter (fun (s, l) -> Printf.printf "%s = %i\n" s l) labels;*)
let rec readuntilclose a t =
match t with
| ']'::t -> rev a, t
| '\\'::('[' | ']' as c)::t -> readuntilclose (c::a) t
| '['::t -> raise PageSpecBadSyntax
| x::t -> readuntilclose (x::a) t
| [] -> rev a, []
in
let rec resolve_pagelabels_inner = function
| '['::t ->
let pagelabel, rest = readuntilclose [] t in
let resolved =
explode (string_of_int (begin match lookup (implode pagelabel) labels with Some x -> x | None -> raise PageSpecBadSyntax end))
in
resolved @ resolve_pagelabels_inner rest
| '\\'::('[' | ']' as c)::t -> c::resolve_pagelabels_inner t
| ']'::t -> raise PageSpecBadSyntax
| h::t -> h::resolve_pagelabels_inner t
| [] -> []
in
resolve_pagelabels_inner spec
let rec parse_pagespec_inner endpage pdf spec =
let spec = if spec = "" then "all" else spec in
let spec = implode (resolve_pagelabels pdf (explode spec)) in
let spec = space_string spec in
if endpage < 1 then raise (Pdf.PDFError "This PDF file has no pages and is therefore malformed") else
let numbers =
try
match explode spec with
| 'N'::'O'::'T'::r ->
invert_range endpage (parse_pagespec_inner endpage pdf (implode r))
| x::'D'::'U'::'P'::r ->
duplicate_range (int_of_string (implode [x])) (parse_pagespec_inner endpage pdf (implode r))
| x::y::'D'::'U'::'P'::r ->
duplicate_range (int_of_string (implode [x; y])) (parse_pagespec_inner endpage pdf (implode r))
| x::y::z::'D'::'U'::'P'::r ->
duplicate_range (int_of_string (implode [x; y; z])) (parse_pagespec_inner endpage pdf (implode r))
| x::y::z::a::'D'::'U'::'P'::r ->
duplicate_range (int_of_string (implode [x; y; z; a])) (parse_pagespec_inner endpage pdf (implode r))
| x::y::z::a::b::'D'::'U'::'P'::r ->
duplicate_range (int_of_string (implode [x; y; z; a; b])) (parse_pagespec_inner endpage pdf (implode r))
| x::y::z::a::b::c::'D'::'U'::'P'::r ->
duplicate_range (int_of_string (implode [x; y; z; a; b; c])) (parse_pagespec_inner endpage pdf (implode r))
| _ ->
match rev (explode spec) with
| ['n'; 'e'; 'v'; 'e'] ->
keep even (ilist 1 endpage)
| ['d'; 'd'; 'o'] ->
keep odd (ilist 1 endpage)
| ['t'; 'i'; 'a'; 'r'; 't'; 'r'; 'o'; 'p'] ->
select_portrait pdf (ilist 1 endpage)
| ['e'; 'p'; 'a'; 'c'; 's'; 'd'; 'n'; 'a'; 'l'] ->
select_landscape pdf (ilist 1 endpage)
| 't'::'i'::'a'::'r'::'t'::'r'::'o'::'p'::more ->
select_portrait
pdf
(mk_numbers pdf endpage (map (fixup_negatives endpage) (Pdfgenlex.lex_string (implode (rev more)))))
| 'e'::'p'::'a'::'c'::'s'::'d'::'n'::'a'::'l'::more ->
select_landscape
pdf
(mk_numbers pdf endpage (map (fixup_negatives endpage) (Pdfgenlex.lex_string (implode (rev more)))))
| 'd'::'d'::'o'::more ->
keep
odd
(mk_numbers pdf endpage (map (fixup_negatives endpage) (Pdfgenlex.lex_string (implode (rev more)))))
| 'n'::'e'::'v'::'e'::more ->
keep
even
(mk_numbers pdf endpage (map (fixup_negatives endpage) (Pdfgenlex.lex_string (implode (rev more)))))
| _ ->
mk_numbers pdf endpage (map (fixup_negatives endpage) (Pdfgenlex.lex_string spec))
with
e -> raise PageSpecBadSyntax
in
if numbers = [] then raise PageSpecWouldBeNoPages else
iter
(fun n ->
if n <= 0 || n > endpage then raise (PageSpecUnknownPage n))
numbers;
numbers
let parse_pagespec pdf spec =
try parse_pagespec_inner (Pdfpage.endpage pdf) pdf spec with
| PageSpecUnknownPage n ->
raise (Pdf.PDFError ("Page " ^ string_of_int n ^ " does not exist."))
| PageSpecWouldBeNoPages ->
raise (Pdf.PDFError ("Page range specifies no pages"))
| e ->
raise
(Pdf.PDFError
("Bad page specification " ^ spec ^
". Raw error was " ^ Printexc.to_string e ^
". Last page was " ^ string_of_int (Pdfpage.endpage pdf)))
(* To validate a pagespec as being syntactically correct without the PDF in
question. This is nasty, since the parser above includes checking based on the
endpage of the PDF (which we don't have). Pass 100 as the endpage, doubling on
page range exception, bailng out above 500000. *)
let rec validate_pagespec_inner n spec =
try
ignore (parse_pagespec_inner n (Pdfpage.minimum_valid_pdf ()) spec); true
with
| PageSpecUnknownPage _ -> if n < 500000 then validate_pagespec_inner (n * 2) spec else false
| PageSpecBadSyntax | _ -> false
let validate_pagespec spec =
validate_pagespec_inner 100 spec
let rec parse_pagespec_without_pdf_inner n spec =
try
parse_pagespec_inner n (Pdfpage.minimum_valid_pdf ()) spec
with
PageSpecUnknownPage _ ->
if n < 500000
then parse_pagespec_without_pdf_inner (n * 2) spec
else raise (Pdf.PDFError "PageSpecUnknownPage")
let parse_pagespec_without_pdf spec =
parse_pagespec_without_pdf_inner 100 spec
(* Convert an integer list representing a set to a page specification, in order. *)
let string_of_pagespec pdf = function
| [] -> ""
| is ->
let iseven len is =
drop_odds (ilist 1 len) = is
in let isodd len is =
really_drop_evens (ilist 1 len) = is
in let isall len is =
ilist 1 len = is
in let is = sort compare is
in let len = Pdfpage.endpage pdf in
let rec mkranges prev = function
| [] -> map extremes (rev (map rev prev))
| h::t ->
match prev with
| (ph::pht)::pt when h = ph + 1 -> mkranges ((h::ph::pht)::pt) t
| (_::_)::_ -> mkranges ([h]::prev) t
| []::_ -> assert false
| [] -> mkranges [[h]] t
in
if iseven len is && len > 3 then "even" else
if isodd len is && len > 2 then "odd" else
if isall len is then "all" else
let ranges = mkranges [] is in
let rangestrings =
map
(function (s, e) ->
if s = e
then string_of_int s
else string_of_int s ^ "-" ^ string_of_int e)
ranges
in
fold_left ( ^ ) "" (interleave "," rangestrings)
(*let string_of_range r =
fold_left (fun a b -> a ^ " " ^ b) "" (map string_of_int r)*)
|