1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231
|
#!/usr/bin/env ruby
# coding: utf-8
# This demonstrates a way to extract some images (those based on the JPG or
# TIFF formats) from a PDF. There are other ways to store images, so
# it may need to be expanded for real world usage, but it should serve
# as a good guide.
#
# Thanks to Jack Rusher for the initial version of this example.
require 'pdf/reader'
module ExtractImages
# Walks the XObject resources of a page and hands every image it finds to
# the writer class that matches the image's stream filter.
class Extractor
  # Extracts the images reachable from +page+.
  #
  # page - an object responding to #xobjects (name => stream pairs) and
  #        #number (used as the first part of each output filename).
  #
  # Returns the number of image XObjects that were encountered.
  def page(page)
    process_page(page, 0)
  end

  private

  def complete_refs
    @complete_refs ||= {}
  end

  # Visits each XObject of +page+ in order. Images are exported, Form
  # XObjects are descended into recursively, anything else is skipped.
  #
  # count - the number of images seen before this call.
  #
  # Returns the updated running image count.
  def process_page(page, count)
    resources = page.xobjects
    return count if resources.empty?

    resources.reduce(count) do |seen, (name, stream)|
      case stream.hash[:Subtype]
      when :Image
        export_image(page, stream, seen + 1, name)
        seen + 1
      when :Form
        # A form carries its own resources, so wrap it and recurse while
        # carrying the running count through.
        process_page(PDF::Reader::FormXObject.new(page, stream), seen)
      else
        seen
      end
    end
  end

  # Saves one image stream as "<page number>-<index>-<resource name>.<ext>".
  # The writer class and the extension are selected from the stream's
  # :Filter entry; unrecognised filters go through the Raw writer.
  def export_image(page, stream, index, name)
    writer, extension =
      case stream.hash[:Filter]
      when :CCITTFaxDecode then [ExtractImages::Tiff, "tif"]
      when :DCTDecode then [ExtractImages::Jpg, "jpg"]
      else [ExtractImages::Raw, "tif"]
      end

    writer.new(stream).save("#{page.number}-#{index}-#{name}.#{extension}")
  end
end
# Writes an image stream whose samples can be used as-is (after the PDF
# stream filters have been undone) to disk as an uncompressed, single-strip,
# little-endian TIFF.
class Raw
  # Size of what this class emits before the first IFD entry: byte-order
  # mark (2), magic number (2), IFD offset (4) and IFD entry count (2).
  TIFF_PREAMBLE_SIZE = 10
  # Each IFD entry is tag (2) + field type (2) + value count (4) + value (4).
  IFD_ENTRY_SIZE = 12
  # Size of the "offset of next IFD" field that terminates the directory.
  NEXT_IFD_SIZE = 4
  # TIFF field type codes.
  TYPE_SHORT = 3
  TYPE_LONG = 4

  attr_reader :stream

  # stream - an object responding to #hash (the image dictionary) and
  #          #unfiltered_data (the decoded sample bytes).
  def initialize(stream)
    @stream = stream
  end

  # Writes the image to +filename+ when its :ColorSpace is :DeviceCMYK,
  # :DeviceGray or :DeviceRGB. Any other colour space is reported on
  # $stderr and no file is written.
  def save(filename)
    case @stream.hash[:ColorSpace]
    when :DeviceCMYK then save_cmyk(filename)
    when :DeviceGray then save_gray(filename)
    when :DeviceRGB then save_rgb(filename)
    else
      $stderr.puts "unsupport color depth #{@stream.hash[:ColorSpace]} #{filename}"
    end
  end

  private

  # photometric 5 = separated, 4 samples per pixel, planar configuration 1
  # (chunky), ink set 1 = CMYK.
  def save_cmyk(filename)
    write_tiff(filename, 5, 4, 1, 1)
  end

  # photometric 1 = black is zero, 1 sample per pixel, planar configuration 1.
  def save_gray(filename)
    write_tiff(filename, 1, 1, 1)
  end

  # photometric 2 = RGB, 3 samples per pixel. No planar configuration tag is
  # written for RGB.
  def save_rgb(filename)
    write_tiff(filename, 2, 3)
  end

  # Packs one IFD entry holding a 16-bit value. The value is packed as a
  # 32-bit little-endian integer, which leaves it in the first two bytes of
  # the four byte value field.
  def short_tag(tag, count, value)
    [tag, TYPE_SHORT, count, value].pack("vvVV")
  end

  # Packs one IFD entry holding a 32-bit value or a file offset.
  def long_tag(tag, count, value)
    [tag, TYPE_LONG, count, value].pack("vvVV")
  end

  # Synthesizes the TIFF header and directory, appends the decoded samples
  # and writes everything to +filename+.
  #
  # photometric   - value for the PhotometricInterpretation tag (262).
  # samples       - samples per pixel (277).
  # planar_config - value for the PlanarConfiguration tag (284), or nil to
  #                 leave the tag out.
  # ink_set       - value for the InkSet tag (332), or nil to leave it out.
  #
  # All multi-byte fields are packed explicitly little-endian ("v"/"V") so
  # they agree with the "II" byte-order mark on every host.
  def write_tiff(filename, photometric, samples, planar_config = nil, ink_set = nil)
    h = stream.hash[:Height]
    w = stream.hash[:Width]
    bpc = stream.hash[:BitsPerComponent]
    len = stream.hash[:Length]
    puts "#{filename}: h=#{h}, w=#{w}, bpc=#{bpc}, len=#{len}"

    # Decode the stream once; the result is needed for both the byte count
    # tag and the payload.
    pixels = stream.unfiltered_data
    # Fall back to 8 bits when the dictionary has no BitsPerComponent.
    bits = bpc || 8

    tag_count = 8
    tag_count += 1 if planar_config
    tag_count += 1 if ink_set
    directory_end = TIFF_PREAMBLE_SIZE + (tag_count * IFD_ENTRY_SIZE) + NEXT_IFD_SIZE

    # With several samples per pixel the bit depths do not fit in the entry
    # itself, so they are stored right after the directory and the entry
    # holds their offset. A single sample is stored inline.
    bits_block = samples > 1 ? ([bits] * samples).pack("V*") : ""
    data_offset = directory_end + bits_block.bytesize

    # header = byte order, version magic, offset of directory, directory count,
    # followed by a series of tags containing metadata (ascending tag order).
    tiff = [73, 73, 42, 8, tag_count].pack("CCvVv")
    tiff << short_tag(256, 1, w) # image width
    tiff << short_tag(257, 1, h) # image height
    if samples > 1
      tiff << long_tag(258, samples, directory_end) # bits per sample (offset)
    else
      tiff << short_tag(258, 1, bits) # bits per sample
    end
    tiff << short_tag(259, 1, 1) # compression - none
    tiff << short_tag(262, 1, photometric) # photometric interpretation
    tiff << long_tag(273, 1, data_offset) # data offset
    tiff << short_tag(277, 1, samples) # samples per pixel
    tiff << long_tag(279, 1, pixels.bytesize) # data byte size
    tiff << short_tag(284, 1, planar_config) if planar_config # planar config
    tiff << long_tag(332, 1, ink_set) if ink_set # inkset
    tiff << [0].pack("V") # next IFD pointer
    tiff << bits_block
    tiff << pixels
    File.open(filename, "wb") { |file| file.write tiff }
  end
end
# Writes a DCT-encoded image stream to disk. The stream bytes are written
# exactly as stored, without any decoding.
class Jpg
  attr_reader :stream

  # stream - an object responding to #hash (the image dictionary) and
  #          #data (the stored bytes).
  def initialize(stream)
    @stream = stream
  end

  # Prints the image dimensions to stdout, then writes the stream's bytes
  # to +filename+ in binary mode.
  def save(filename)
    dictionary = stream.hash
    width = dictionary[:Width]
    height = dictionary[:Height]
    puts "#{filename}: h=#{height}, w=#{width}"

    File.open(filename, "wb") do |output|
      output.write(stream.data)
    end
  end
end
# Wraps a CCITTFaxDecode image stream in a minimal little-endian TIFF
# container so the still-compressed fax data can be opened by TIFF readers.
class Tiff
  # Default for the Columns decode parameter when the entry is absent.
  DEFAULT_COLUMNS = 1728

  attr_reader :stream

  # stream - an object responding to #hash (the image dictionary) and
  #          #data (the stored, still encoded bytes).
  def initialize(stream)
    @stream = stream
  end

  # Writes the image to +filename+ when it is Group 4 encoded, i.e. when the
  # K decode parameter is negative. K = 0 (Group 3, 1-D) and K > 0 (Group 3,
  # mixed) cannot be labelled with the Group 4 compression tag used here, so
  # they are reported on $stderr and no file is written. A stream without
  # :DecodeParms is treated as K = 0.
  def save(filename)
    if k_parameter < 0
      save_group_four(filename)
    else
      $stderr.puts "#{filename}: CCITT non-group 4/2D image."
    end
  end

  private

  # The decode parameter dictionary, or an empty hash when the stream has
  # none, so lookups never hit nil.
  def decode_parms
    stream.hash[:DecodeParms] || {}
  end

  # The K decode parameter, defaulting to 0 when absent.
  def k_parameter
    decode_parms[:K] || 0
  end

  # Group 4, 2D
  def save_group_four(filename)
    k = k_parameter
    h = stream.hash[:Height]
    w = stream.hash[:Width]
    bpc = stream.hash[:BitsPerComponent]
    mask = stream.hash[:ImageMask]
    len = stream.hash[:Length]
    cols = decode_parms[:Columns] || DEFAULT_COLUMNS
    puts "#{filename}: h=#{h}, w=#{w}, bpc=#{bpc}, mask=#{mask}, len=#{len}, cols=#{cols}, k=#{k}"

    payload = stream.data

    # Synthesize a TIFF header. Every field is packed explicitly
    # little-endian ("v"/"V") to agree with the "II" byte-order mark.
    long_tag = lambda { |tag, value| [tag, 4, 1, value].pack("vvVV") }
    short_tag = lambda { |tag, value| [tag, 3, 1, value].pack("vvVV") }

    # header = byte order, version magic, offset of directory, directory count,
    # followed by a series of tags containing metadata: 259 is a magic number for
    # the compression type; 273 is the offset of the image data.
    tiff = [73, 73, 42, 8, 5].pack("CCvVv")
    tiff << short_tag.call(256, cols)
    tiff << short_tag.call(257, h)
    tiff << short_tag.call(259, 4)
    tiff << long_tag.call(273, (10 + (5 * 12) + 4))
    # Count the bytes that are actually appended rather than trusting the
    # dictionary's :Length entry.
    tiff << long_tag.call(279, payload.bytesize)
    tiff << [0].pack("V")
    tiff << payload
    File.open(filename, "wb") { |file| file.write tiff }
  end
end
end
# Run the extractor against the first page of the sample PDF that lives in
# ../spec/data relative to this script.
script_dir = File.expand_path(File.dirname(__FILE__))
filename = "#{script_dir}/../spec/data/adobe_sample.pdf"
extractor = ExtractImages::Extractor.new

PDF::Reader.open(filename) { |reader| extractor.page(reader.page(1)) }
|