File: PDF.cc

package info (click to toggle)
derivations 0.56.20180123.1-2
  • links: PTS
  • area: main
  • in suites: bullseye, buster, sid
  • size: 2,388 kB
  • sloc: cpp: 1,633; perl: 692; makefile: 158; sh: 153
file content (233 lines) | stat: -rw-r--r-- 6,711 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233

#include "PDF.h"
#include <sys/stat.h>
#include <cctype>
#include <fstream>
#include <limits>
#include <string>
#include "PDF_rep.h"

// Returns the length of the PDF file as cached in the representation
// (the constructor fills file_length1 from stat()'s st_size).
int PDF::file_length( const PDF &pdf ) {
  const int length = pdf.rep->file_length1;
  return length;
}

// Scans the named file for the last occurrence of the "startxref" keyword
// and returns the decimal number that follows it, which is the byte
// offset of the file's last XRef table.
//
// A keyword occurrence counts only if it stands at the start of the file
// or directly after whitespace, and is itself followed by at least one
// whitespace character.  Each such occurrence supersedes all earlier ones:
// if it is followed by digits, their value becomes the result; if it is
// followed by something else, the result reverts to -1.
//
// Returns -1 if the file cannot be read, if no usable keyword is found, or
// if the number does not fit in an int.
//
// History (update of 2015, seven years after most of the rest of the file
// and program were written):
//
// In an earlier version of the program, this function's body was a
// one-liner:
//
//   return pdf.rep->xref->getLastXRefPos();
//
// That worked fine until Libpoppler changed its interface, since which it
// has hidden the required function in the private section of an
// interface, unusable here.  In a later version of the program, this
// function's body was a different one-liner:
//
//   return pdf.rep->xref->getEntry(0)->offset;
//
// This unfortunately does the wrong thing, though, with effects Salvatore
// Bonaccorso has noticed and brought to attention.  Accordingly, this
// function itself must now find the position of the last XRef table.
// Fortunately, the PDF standard requires the position of an XRef table to
// be given in plain ASCII, so finding the position is not too hard.  One
// must only be sure to find the position of the *last* XRef table.
int PDF::offset_last_xref_table( const std::string &pdf_filename ) {

  typedef std::ifstream::traits_type traits;

  const char key_token[] = "startxref";
  const int int_max = std::numeric_limits<int>::max();

  // A PDF is a binary file; binary mode keeps the stream from translating
  // line endings or stopping early on platforms where text mode does so.
  // If the file cannot be opened, the first get() yields EOF and the
  // function returns -1.
  std::ifstream pdf_file(
    pdf_filename.c_str(), std::ios::in | std::ios::binary
  );

  int offset = -1;

  // True when the most recently consumed character was whitespace (or
  // when nothing has been consumed yet); the flag is left untouched while
  // a keyword match is in progress.
  bool after_whitespace = true;

  // Next character of key_token still to be matched.
  const char *p = key_token;

  while (true) {

    const int c = pdf_file.get();
    if ( c == traits::eof() ) break;

    if ( !after_whitespace || c != *p ) {
      // Not (or no longer) inside a keyword: restart the match.
      p = key_token;
      after_whitespace = std::isspace(c);
      continue;
    }

    ++p;
    if (*p) continue;  // Keyword only partly matched so far.
    p = key_token;

    // The whole keyword has been matched.  From here on, peek() is used
    // for look-ahead so that a character that belongs to neither the
    // whitespace nor the number stays in the stream for the main loop.
    bool has_trailing_whitespace = false;
    while ( std::isspace( pdf_file.peek() ) ) {
      pdf_file.get();
      has_trailing_whitespace = true;
    }

    // End of file before any number: keep whatever was found earlier.
    if ( pdf_file.peek() == traits::eof() ) break;

    after_whitespace = has_trailing_whitespace;
    if (!has_trailing_whitespace) continue;  // E.g. "startxrefs".

    // A well-delimited keyword supersedes every earlier one.
    offset = -1;
    if ( !std::isdigit( pdf_file.peek() ) ) continue;

    // Accumulate the decimal offset, refusing to overflow an int.
    int value = 0;
    bool overflowed = false;
    while ( std::isdigit( pdf_file.peek() ) ) {
      const int digit = pdf_file.get() - '0';
      if ( !overflowed && value <= ( int_max - digit ) / 10 ) {
        value = 10 * value + digit;
      }
      else overflowed = true;
    }
    if (!overflowed) offset = value;

    // The last consumed character was a digit.
    after_whitespace = false;

  }

  return offset;

}

// Returns the indirect reference (object number and generation) of the
// document's root object, as reported by the XRef's getRootNum() and
// getRootGen().
PDF::Iref PDF::iref_catalog( const PDF &pdf ) {
  return Iref(
    pdf.rep->xref->getRootNum(),
    pdf.rep->xref->getRootGen()
  );
}

// Returns the indirect reference of the Info object, which the
// constructor took from the trailer's "Info" entry.
PDF::Iref PDF::iref_info( const PDF &pdf ) {
  const Iref info = pdf.rep->info_iref;
  return info;
}

// Returns the object count that the constructor read from the trailer's
// "Size" entry.
int PDF::n_obj( const PDF &pdf ) {
  const int count = pdf.rep->n_obj1;
  return count;
}

// Returns the offset field of the i-th XRef entry, converted to int.
int PDF::offset( const PDF &pdf, const int i ) {
  const int entry_offset = pdf.rep->xref->getEntry(i)->offset;
  return entry_offset;
}

// Returns the number of pages reported by the document's catalog.
int PDF::n_page( const PDF &pdf ) {
  const int pages = pdf.rep->catalog2->getNumPages();
  return pages;
}

// Looks up the page whose indirect reference is iref and returns the
// number the catalog's findPage() reports for it.  A result of zero is
// treated as "no such page": it makes the function throw Exc_PDF, unless
// do_not_throw is set, in which case the zero is returned to the caller.
int PDF::i_page(
  const PDF &pdf,
  const Iref iref,
  const bool do_not_throw
) {
  const int page_number =
    pdf.rep->catalog2->findPage( iref.i, iref.gen );
  if ( page_number != 0 || do_not_throw ) return page_number;
  throw PDF::Exc_PDF();
}

// Returns the indirect reference of page i as given by the catalog's
// getPageRef().  Throws Exc_PDF if the catalog returns a null pointer.
PDF::Iref PDF::iref_page( const PDF &pdf, const int i ) {
  if ( const Ref *const page_ref = pdf.rep->catalog2->getPageRef(i) ) {
    return Iref( page_ref->num, page_ref->gen );
  }
  throw PDF::Exc_PDF();
}

// The programmer is not certain that the code below is free of small
// memory leaks.  The amount of memory in question is small, and the
// system reclaims leaked memory when execution ends anyway, so any such
// leak is not serious; even so, leaking is not neat.  The only
// documentation for Libpoppler appears to consist of its development
// headers, which seem insufficiently informative in the matter.  For
// these reasons, where in doubt, the code leaks rather than risk
// improper deallocation.

// Opens the PDF file named filename_pdf through Libpoppler and caches, in
// a freshly allocated PDF_rep, the items the accessors in this file hand
// out: the file length, the PDFDoc and its XRef, the trailer dictionary,
// the trailer's "Size", the catalog (both as a dictionary and as
// Libpoppler's Catalog object), and the Info reference and dictionary.
//
// Throws Exc_IO if the file cannot be stat()ed or Libpoppler reports the
// document as not ok.  Throws Exc_PDF if any of the other items is missing
// or has an unexpected type.
PDF::PDF::PDF( const std::string &filename_pdf )
  : rep( new PDF_rep() )
{
  {
    struct stat s;
    if ( stat( filename_pdf.c_str(), &s ) ) throw Exc_IO();
    rep->file_length1 = s.st_size;
  }
  {
    // PDFDoc is handed a pointer to the file name and holds on to it
    // beyond this block (per Libpoppler's PDFDoc implementation it takes
    // ownership of the string), so the GooString must live on the heap.
    // Passing the address of a block-local GooString would leave the
    // PDFDoc with a dangling pointer as soon as this block ends.
    rep->pdfdoc = new PDFDoc( new GooString( filename_pdf.c_str() ) );
    if ( !rep->pdfdoc->isOk() ) throw Exc_IO();
  }
  {
    rep->xref = rep->pdfdoc->getXRef();
    if ( !rep->xref->isOk() ) throw Exc_PDF();
  }
  {
    Object *const obj = rep->xref->getTrailerDict();
    if ( !obj->isDict() ) throw Exc_PDF();
    rep->trailer = obj->getDict();
  }
  {
    // The trailer's "Size" entry supplies the object count.
    Object obj = rep->trailer->lookup( "Size", 0 );
    if ( !obj.isInt() ) throw Exc_PDF();
    rep->n_obj1 = obj.getInt();
  }
  {
    // The Object is kept on the heap because rep->catalog points into it;
    // the destructor deletes it.
    rep->catalog_obj = new Object( rep->xref->getCatalog() );
    if ( !rep->catalog_obj->isDict() ) throw Exc_PDF();
    rep->catalog = rep->catalog_obj->getDict();
  }
  {
    rep->catalog2 = rep->pdfdoc->getCatalog();
    if ( !rep->catalog2->isOk() ) throw Exc_PDF();
  }
  {
    // lookupNF() is used so that the Info entry is seen as the reference
    // itself; anything other than a reference is rejected.
    Object obj( rep->trailer->lookupNF( "Info"  ) );
    if ( !obj.isRef() ) throw Exc_PDF();
    const Ref ref = obj.getRef();
    rep->info_iref = Iref( ref.num, ref.gen );
  }
  {
    // As with the catalog, the fetched Object is heap-allocated because
    // rep->info points into it; the destructor deletes it.
    rep->info_obj = new Object(
      rep->xref->fetch( rep->info_iref.i, rep->info_iref.gen, 0 )
    );
    if ( !rep->info_obj->isDict() ) throw Exc_PDF();
    rep->info = rep->info_obj->getDict();
  }
}

// Releases the heap objects the constructor created, except the PDFDoc.
//
// rep->pdfdoc is deliberately NOT deleted: for reasons this programmer
// does not understand, the Libpoppler PDFDoc object does not seem to
// deallocate gracefully, so it is allowed to leak.
PDF::PDF::~PDF() {
  // The catalog and Info objects go first, then the representation that
  // holds the pointers to them.
  delete rep->catalog_obj;
  delete rep->info_obj;
  delete rep;
}

// Hands out the internal representation, but only to callers that supply
// the right magic integer; everyone else gets a null pointer.
//
// The magic integer exists precisely to discourage callers from calling
// this function, and conversely to keep it from returning disruptive
// information to unwitting callers.  It serves no other purpose, and its
// value is not elsewhere documented.  If you must call this function,
// then supply the integer.  (Its value has 1s in the zeroth and fifteenth
// bits, with six more 1s scattered randomly across the fourteen places
// between.  It has no significance.)
PDF::PDF_rep *PDF::PDF::get_PDF_rep( const int magic ) {
  const int required_magic = 0x9f05;
  if ( magic != required_magic ) return 0;
  return rep;
}