File: hocr2pdf.cc

package info (click to toggle)
exactimage 1.2.1-3
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 3,048 kB
  • sloc: cpp: 35,940; ansic: 1,952; xml: 1,447; makefile: 338; perl: 138; sh: 110; python: 45; php: 37; ruby: 12
file content (152 lines) | stat: -rw-r--r-- 4,544 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
/*
 * The ExactImage library's hOCR to PDF command line frontend
 * Copyright (C) 2008 - 2023 René Rebe, ExactCODE GmbH Germany
 * Copyright (C) 2008 Archivista
 * 
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; version 2. A copy of the GNU General
 * Public License can be found in the file LICENSE.
 * 
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANT-
 * ABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
 * Public License for more details.
 *
 * Alternatively, commercial licensing options are available from the
 * copyright holder ExactCODE GmbH Germany.
 */

#include <string.h>

#include <iostream>
#include <fstream>
#include <iomanip>
#include <cmath>
#include <cctype>
#include <vector>

#include "ArgumentList.hh"

#include "config.h"

#include "Codecs.hh"
#include "pdf.hh"
#include "hocr.hh"

using namespace Utility;

/*
 * Command line entry point of the hOCR to PDF converter.
 *
 * Parses the command line, optionally loads the input image, then hands
 * standard input to hocr2pdf() together with a PDF context that writes
 * to the --output file. Unless --no-image is given the image is placed
 * on the page afterwards. With --text the stream for that file is also
 * passed to hocr2pdf().
 *
 * Returns 0 on success and 1 when the arguments are invalid, help was
 * requested, the image cannot be read, or an output file cannot be
 * opened or written.
 */
int main(int argc, char* argv[])
{
  ArgumentList arglist(false);

  // setup the argument list
  Argument<bool> arg_help("h", "help",
                          "display this help text and exit");
  arglist.Add(&arg_help);

  Argument<std::string> arg_input("i", "input",
                                  "input image filename",
                                  1, 1, true, true);
  arglist.Add(&arg_input);

  Argument<std::string> arg_output("o", "output",
                                   "output PDF filename",
                                   1, 1, true, true);
  arglist.Add(&arg_output);

  Argument<int> arg_resolution("r", "resolution",
                               "resolution overwrite",
                               0, 1, true, true);
  arglist.Add(&arg_resolution);

  Argument<bool> arg_no_image("n", "no-image",
                              "do not place the image over the text",
                              0, 0, true, true);
  arglist.Add(&arg_no_image);

  Argument<bool> arg_sloppy_text("s", "sloppy-text",
                                 "sloppily place text, group words, do not draw single glyphs",
                                 0, 0, true, true);
  arglist.Add(&arg_sloppy_text);

  Argument<std::string> arg_text("t", "text",
                                 "extract text, including trying to remove hyphens",
                                 0, 1, true, true);
  arglist.Add(&arg_text);

  Argument<int> arg_quality ("", "quality",
                             "quality setting used for writing compressed images\n\t\t"
                             "integer range 0-100, the default is 75",
                             0, 1, true, true);
  arglist.Add(&arg_quality);

  Argument<std::string> arg_compression ("", "compress",
                                         "compression method for writing images e.g. ascii85, hex, flate,\n"
                                         "\t\tjpeg, jpeg2000 ... auto default based on bit-depth",
                                         0, 1, true, true);
  arglist.Add(&arg_compression);


  // parse the specified argument list - and maybe output the Usage
  if (!arglist.Read(argc, argv) || arg_help.Get() == true)
    {
      std::cerr << "ExactImage hOCR to PDF converter, version " VERSION << std::endl
                << "Copyright (C) 2008 - 2023 René Rebe, ExactCODE GmbH" << std::endl
                << "Copyright (C) 2008 Archivista" << std::endl
                << "Usage:" << std::endl;

      arglist.Usage(std::cerr);
      return 1;
    }

  // load the image, if specified and possible

  Image image; image.w = image.h = 0;
  if (arg_input.Size())
    {
      if (!ImageCodec::Read(arg_input.Get(), image)) {
        std::cerr << "Error reading: " << arg_input.Get() << std::endl;
        return 1;
      }
    }

  // a resolution given on the command line replaces the one of the image;
  // without any usable value fall back to 300
  if (arg_resolution.Size())
    image.setResolution(arg_resolution.Get(), arg_resolution.Get());
  if (image.resolutionX() <= 0 || image.resolutionY() <= 0) {
    std::cerr << "Warning: Image x/y resolution not set, defaulting to: "
              << 300 << std::endl;
    image.setResolution(300, 300);
  }
  unsigned int res = image.resolutionX();
  bool sloppy = arg_sloppy_text.Get();

  // optional plain text output; txtStream stays 0 when --text was not given
  std::ofstream txtFile;
  std::ofstream* txtStream = 0;
  if (arg_text.Size()) {
    txtFile.open(arg_text.Get().c_str());
    if (!txtFile) {
      std::cerr << "Error opening text output: " << arg_text.Get() << std::endl;
      return 1;
    }
    txtStream = &txtFile;
  }

  // PDF files carry binary stream data, so the output must not be subject
  // to newline translation
  std::ofstream s(arg_output.Get().c_str(), std::ios::out | std::ios::binary);
  if (!s) {
    std::cerr << "Error opening output: " << arg_output.Get() << std::endl;
    return 1;
  }

  {
    // The context lives in its own scope so that it is destroyed before
    // the streams are checked and closed, matching the point at which the
    // previous implementation deleted it.
    // NOTE(review): presumably the destructor completes the document -
    // confirm in pdf.hh.
    PDFCodec pdfContext(&s);

    // page size: image size in pixels scaled by 72 / resolution
    pdfContext.beginPage(72. * image.w / res, 72. * image.h / res);
    pdfContext.setFillColor(0, 0, 0);

    // NOTE(review): a possible result of hocr2pdf() is not evaluated, as before
    hocr2pdf(std::cin, &pdfContext, res, sloppy, txtStream);

    int quality = 75;
    if (arg_quality.Size())
      quality = arg_quality.Get();
    std::string compression = "";
    if (arg_compression.Size())
      compression = arg_compression.Get();

    if (!arg_no_image.Get())
      pdfContext.showImage(image, 0, 0, 72. * image.w / res, 72. * image.h / res, quality, compression);
  }

  // report write failures (e.g. disk full) instead of claiming success
  bool ok = true;

  s.flush();
  if (!s) {
    std::cerr << "Error writing: " << arg_output.Get() << std::endl;
    ok = false;
  }

  if (txtStream) {
    txtFile.close();
    if (!txtFile) {
      std::cerr << "Error writing: " << arg_text.Get() << std::endl;
      ok = false;
    }
  }

  return ok ? 0 : 1;
}