File: cpdfpagespec.ml

package info (click to toggle)
cpdf 2.9-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 7,140 kB
  • sloc: ml: 35,825; makefile: 66; sh: 49
file content (245 lines) | stat: -rw-r--r-- 9,834 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
open Pdfutil

(* Raised when syntax is wrong. Caught and reraised by parse_pagespec and
validator. *)
exception PageSpecBadSyntax

(* Parsing range specifications *)
let rec splitat_commas toks =
  match cleavewhile (neq (Pdfgenlex.LexName ",")) toks with
  | [], [] -> []
  | [], _ -> raise PageSpecBadSyntax
  | some, [] -> [some]
  | _::_ as before, _::rest -> before::splitat_commas rest 

let is_dimension pdf comparison {Pdfpage.mediabox = box} =
  let minx, miny, maxx, maxy = Pdf.parse_rectangle pdf box in
    comparison (maxx -. minx) (maxy -. miny)

let select_dimensions comparison pdf candidates =
  let pages = Pdfpage.pages_of_pagetree pdf in
    let pagenums, kept_pages =
      split
        (option_map
           (fun (index, page) ->
             if mem index candidates then Some (index, page) else None)
           (combine (indx pages) pages))
    in
      option_map2
        (fun pagenum page ->
          if is_dimension pdf comparison page then Some pagenum else None)
        pagenums
        kept_pages

let select_portrait = select_dimensions ( < )

let select_landscape = select_dimensions ( > )

let select_annotated pdf range =
 let pages = Pdfpage.pages_of_pagetree pdf in
 let num_annots page =
   match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with
   | Some (Pdf.Array a) -> length a
   | _ -> 0
 in
   option_map (fun n -> if num_annots (select n pages) > 0 then Some n else None) range

let rec mk_numbers pdf endpage lexemes =
  match lexemes with
  | [Pdfgenlex.LexInt n] -> [n]
  | [Pdfgenlex.LexName "end"] -> [endpage]
  | [Pdfgenlex.LexInt n; Pdfgenlex.LexName "-"; Pdfgenlex.LexInt n'] ->
      if n > n' then rev (ilist n' n) else ilist n n'
  | [Pdfgenlex.LexName "end"; Pdfgenlex.LexName "-"; Pdfgenlex.LexInt n] ->
      if n <= endpage
        then rev (ilist n endpage)
        else [endpage]
  | [Pdfgenlex.LexInt n; Pdfgenlex.LexName "-"; Pdfgenlex.LexName "end"] ->
      if n <= endpage
        then ilist n endpage
        else [endpage]
  | [Pdfgenlex.LexName "end"; Pdfgenlex.LexName "-"; Pdfgenlex.LexName "end"] ->
       [endpage]
  | [Pdfgenlex.LexName "even"] ->
       drop_odds (ilist 1 endpage)
  | [Pdfgenlex.LexName "portrait"] ->
       select_portrait pdf (ilist 1 endpage)
  | [Pdfgenlex.LexName "landscape"] ->
       select_landscape pdf (ilist 1 endpage)
  | [Pdfgenlex.LexName "annotated"] ->
       select_annotated pdf (ilist 1 endpage)
  | [Pdfgenlex.LexName "odd"] ->
       really_drop_evens (ilist 1 endpage)
  | [Pdfgenlex.LexName "all"] ->
       ilist 1 endpage
  | [Pdfgenlex.LexName "reverse"] ->
       rev (ilist 1 endpage)
  | [Pdfgenlex.LexName "empty"] -> []
  | toks ->
      let ranges = splitat_commas toks in
        if ranges = [toks] then raise PageSpecBadSyntax else
          flatten (map (mk_numbers pdf endpage) ranges)

(* Space dashes and commas *)
let rec add_spaces = function
  | [] -> []
  | ('-' | ',') as h::t -> ' '::h::' '::add_spaces t
  | h::t -> h::add_spaces t

let space_string s =
  implode (add_spaces (explode s))

let fixup_negatives endpage = function
  | Pdfgenlex.LexName s when String.length s > 1 && s.[0] = '~' ->
      Pdfgenlex.LexInt (endpage + 1 + ~-(int_of_string (implode (tl (explode s)))))
  | x -> x

let invert_range endpage r =
  option_map (fun p -> if mem p r then None else Some p) (ilist 1 endpage)

let duplicate_range n r =
  flatten (map (fun x -> many x n) r)

(* e.g [1] -> 1, [iii] -> x, [/]] -> ] etc. *)
let resolve_pagelabels pdf spec =
  let labels =
    let labs = Pdfpagelabels.read pdf in
      map
        (fun pnum -> (begin try Pdfpagelabels.pagelabeltext_of_pagenumber pnum labs with Not_found -> "" end, pnum))
        (ilist 1 (Pdfpage.endpage pdf))
  in
  (*iter (fun (s, l) -> Printf.printf "%s = %i\n" s l) labels;*)
  let rec readuntilclose a t =
    match t with
    | ']'::t -> rev a, t
    | '\\'::('[' | ']' as c)::t -> readuntilclose (c::a) t
    | '['::t -> raise PageSpecBadSyntax
    | x::t -> readuntilclose (x::a) t
    | [] -> rev a, []
  in
  let rec resolve_pagelabels_inner = function
    | '['::t ->
        let pagelabel, rest = readuntilclose [] t in
        let resolved =
          explode (string_of_int (begin match lookup (implode pagelabel) labels with Some x -> x | None -> raise PageSpecBadSyntax end))
        in
          resolved @ resolve_pagelabels_inner rest
    | '\\'::('[' | ']' as c)::t -> c::resolve_pagelabels_inner t
    | ']'::t -> raise PageSpecBadSyntax
    | h::t -> h::resolve_pagelabels_inner t
    | [] -> []
  in
    resolve_pagelabels_inner spec

let rec parse_pagespec_inner endpage pdf spec =
  let spec = if spec = "" then "all" else spec in (* Required for DUP to work. *)
  let spec = implode (resolve_pagelabels pdf (explode spec)) in
  let spec = space_string spec in
    if endpage < 1 then raise (Pdf.PDFError "This PDF file has no pages and is therefore malformed") else
      let numbers =
        try
          match explode spec with
          | 'N'::'O'::'T'::r ->
              invert_range endpage (parse_pagespec_inner endpage pdf (implode r))
          | x::'D'::'U'::'P'::r ->
              duplicate_range (int_of_string (implode [x])) (parse_pagespec_inner endpage pdf (implode r))
          | x::y::'D'::'U'::'P'::r ->
              duplicate_range (int_of_string (implode [x; y])) (parse_pagespec_inner endpage pdf (implode r))
          | x::y::z::'D'::'U'::'P'::r ->
              duplicate_range (int_of_string (implode [x; y; z])) (parse_pagespec_inner endpage pdf (implode r))
          | x::y::z::a::'D'::'U'::'P'::r ->
              duplicate_range (int_of_string (implode [x; y; z; a])) (parse_pagespec_inner endpage pdf (implode r))
          | x::y::z::a::b::'D'::'U'::'P'::r ->
              duplicate_range (int_of_string (implode [x; y; z; a; b])) (parse_pagespec_inner endpage pdf (implode r))
          | x::y::z::a::b::c::'D'::'U'::'P'::r ->
              duplicate_range (int_of_string (implode [x; y; z; a; b; c])) (parse_pagespec_inner endpage pdf (implode r))
          | _ ->
            match rev (explode spec) with
            | ['n'; 'e'; 'v'; 'e'] ->
                keep even (ilist 1 endpage)
            | ['d'; 'd'; 'o'] ->
                keep odd (ilist 1 endpage)
            | ['t'; 'i'; 'a'; 'r'; 't'; 'r'; 'o'; 'p'] ->
                select_portrait pdf (ilist 1 endpage)
            | ['e'; 'p'; 'a'; 'c'; 's'; 'd'; 'n'; 'a'; 'l'] ->
                select_landscape pdf (ilist 1 endpage)
            | 't'::'i'::'a'::'r'::'t'::'r'::'o'::'p'::more ->
                select_portrait
                  pdf
                  (mk_numbers pdf endpage (map (fixup_negatives endpage) (Pdfgenlex.lex_string (implode (rev more)))))
            | 'e'::'p'::'a'::'c'::'s'::'d'::'n'::'a'::'l'::more ->
                select_landscape
                  pdf
                  (mk_numbers pdf endpage (map (fixup_negatives endpage) (Pdfgenlex.lex_string (implode (rev more)))))
            | 'd'::'d'::'o'::more ->
                keep
                  odd
                  (mk_numbers pdf endpage (map (fixup_negatives endpage) (Pdfgenlex.lex_string (implode (rev more)))))
            | 'n'::'e'::'v'::'e'::more ->
                keep
                  even
                  (mk_numbers pdf endpage (map (fixup_negatives endpage) (Pdfgenlex.lex_string (implode (rev more)))))
            | _ ->
                mk_numbers pdf endpage (map (fixup_negatives endpage) (Pdfgenlex.lex_string spec))
        with
          e -> raise PageSpecBadSyntax
      in
        if numbers = [] then Pdfe.log "Warning: empty page range\n";
        let numbers' = lose (fun n -> n <= 0 || n > endpage) numbers in
          if length numbers' <> length numbers then
            begin
              Pdfe.log "Warning: page range contains nonexistant pages: ";
              iter (fun x -> Pdfe.log (string_of_int x ^ " ")) (keep (fun n -> n <= 0 || n > endpage) numbers);
              Pdfe.log "\n"
            end;
          numbers'

let parse_pagespec pdf spec =
  try parse_pagespec_inner (Pdfpage.endpage pdf) pdf spec with
  | e ->
     raise
       (Pdf.PDFError
         ("Bad page specification " ^ spec ^
          ". Raw error was " ^ Printexc.to_string e ^
          ". Last page was " ^ string_of_int (Pdfpage.endpage pdf)))

let rec parse_pagespec_without_pdf spec =
  parse_pagespec_inner 500000 (Pdfpage.minimum_valid_pdf ()) spec

(* Convert an integer list representing a set to a page specification, in order. *)
let string_of_pagespec pdf = function
  | [] -> ""
  | is ->
      let iseven len is =
        drop_odds (ilist 1 len) = is
      in let isodd len is =
        really_drop_evens (ilist 1 len) = is
      in let isall len is =
        ilist 1 len = is
      in let is = sort compare is
      in let len = Pdfpage.endpage pdf in
        let rec mkranges prev = function
        | [] -> map extremes (rev (map rev prev))
        | h::t ->
            match prev with
            | (ph::pht)::pt when h = ph + 1 -> mkranges ((h::ph::pht)::pt) t
            | (_::_)::_ -> mkranges ([h]::prev) t
            | []::_ -> assert false
            | [] -> mkranges [[h]] t
        in
          if iseven len is && len > 3 then "even" else
            if isodd len is && len > 2 then "odd" else
              if isall len is then "all" else
                let ranges = mkranges [] is in
                  let rangestrings =
                    map
                      (function (s, e) ->
                         if s = e
                           then string_of_int s
                           else string_of_int s ^ "-" ^ string_of_int e)
                      ranges
                  in
                    fold_left ( ^ ) "" (interleave "," rangestrings)

(*let string_of_range r =
  fold_left (fun a b -> a ^ " " ^ b) "" (map string_of_int r)*)