// ContentParser: PoDoFo test program that tokenizes the content stream of one
// or more pages of a PDF file and reports how many keywords and variants it read.
#include "podofo.h"
#include "../PdfTest.h"
#include <algorithm>
#include <climits>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <stack>
#include <string>
using namespace std;
using namespace PoDoFo;
static bool print_output = false;
void parse_contents( PdfContentsTokenizer* pTokenizer )
{
const char* pszToken = NULL;
PdfVariant var;
EPdfContentsType eType;
std::string str;
int numKeywords = 0;
int numVariants = 0;
std::stack<PdfVariant> stack;
while( pTokenizer->ReadNext( eType, pszToken, var ) )
{
if( eType == ePdfContentsType_Keyword )
{
++numKeywords;
if (print_output) std::cout << setw(12) << (numKeywords+numVariants)
<< " Keyword: " << pszToken << std::endl;
// support 'l' and 'm' tokens
if( strcmp( pszToken, "l" ) == 0 )
{
double dPosY = stack.top().GetReal();
stack.pop();
double dPosX = stack.top().GetReal();
stack.pop();
if(print_output) std::cout << string(12,' ') << " LineTo: " << dPosX << " " << dPosY << std::endl;
}
else if( strcmp( pszToken, "m" ) == 0 )
{
double dPosY = stack.top().GetReal();
stack.pop();
double dPosX = stack.top().GetReal();
stack.pop();
if(print_output) std::cout << string(12,' ') << " MoveTo: " << dPosX << " " << dPosY << std::endl;
}
}
else if ( eType == ePdfContentsType_Variant )
{
++numVariants;
var.ToString( str );
if(print_output) std::cout << setw(12) << (numKeywords+numVariants)
<< " Variant: " << str << std::endl;
stack.push( var );
}
else if (eType == ePdfContentsType_ImageData)
{
if (print_output) {
std::string d ( var.GetRawData().data() );
std::cout << string(13, ' ') << "Inline image data: " << d.size() << " bytes. Hex follows." << std::hex << std::endl;
std::cout << std::hex << std::setfill('0');
for ( std::string::iterator i = d.begin(); i != d.end(); i ++) {
std::cout << std::setw(2) << (static_cast<unsigned short>(*i) & 0x00FF);
}
std::cout << std::dec << std::setfill(' ') << std::endl;
}
}
else
{
// Impossible; type must be keyword or variant
PODOFO_RAISE_ERROR( ePdfError_InternalLogic );
}
}
cout << ' ' << setw(12) << numKeywords << " keywords, " << setw(12) << numVariants << " variants";
}
/**
 * Run parse_contents() over the content of a single page.
 *
 * The document pointer is accepted for interface symmetry but is unused.
 *
 * \param pPage page used to construct the PdfContentsTokenizer
 */
void parse_page( PdfMemDocument*, PdfPage* pPage )
{
    PdfContentsTokenizer pageTokenizer( pPage );
    PdfContentsTokenizer* const pTokenizer = &pageTokenizer;

    parse_contents( pTokenizer );
}
/**
 * Print the command line help to stdout.
 *
 * Lists exactly the flags that main() accepts: -a, -p and -nPAGE.
 */
void usage()
{
    printf("Usage: ContentParser [-a] [-p] [-nPAGE] input_filename\n");
    printf("       -a       Process all pages of input, not just first\n");
    printf("       -p       Print parsed content stream to stdout\n");
    printf("       -nPAGE   Start at page PAGE (1-based, no space after -n)\n");
}
int main( int argc, char* argv[] )
{
bool all_pages = false;
int firstPageNo = 0;
string inputFileName;
++argv;
--argc;
while (argc)
{
if( argv[0][0] == '-' )
{
// Single character flag
switch( argv[0][1] )
{
case 'a':
// Process all pages, not just first page
all_pages = true;
break;
case 'p':
// Print output, rather than parsing & checking
// silently.
print_output = true;
break;
case 'n':
// Page number request. Chars 2+ are page number int. Let's do
// this the quick and dirty way...
firstPageNo = atoi(argv[0]+2) - 1;
cerr << "Will process page: " << (firstPageNo+1) << endl;
break;
default:
usage();
return 1;
}
}
else
{
// Input filename
if (inputFileName.empty())
{
inputFileName = argv[0];
}
else
{
usage();
return 1;
}
}
++argv;
--argc;
}
if (inputFileName.empty())
{
usage();
return 1;
}
try
{
PdfMemDocument doc( inputFileName.c_str() );
if( !doc.GetPageCount() )
{
std::cerr << "This document contains no page!" << std::endl;
return 1;
}
int toPage = all_pages ? doc.GetPageCount() : firstPageNo + 1 ;
for ( int i = firstPageNo; i < toPage; ++i )
{
cout << "Processing page " << setw(6) << (i+1) << "..." << std::flush;
PdfPage* page = doc.GetPage( i );
PODOFO_RAISE_LOGIC_IF( !page, "Got null page pointer within valid page range" );
parse_page( &doc, page );
cout << " - page ok" << endl;
}
}
catch( const PdfError & e )
{
e.PrintErrorMsg();
return e.GetError();
}
cout << endl;
return 0;
}
// End of ContentParser test driver.