File: meta_data.rb

package info (click to toggle)
ruby-ole 1.2.11.3-1
  • links: PTS, VCS
  • area: main
  • in suites: wheezy
  • size: 628 kB
  • sloc: ruby: 2,870; makefile: 9
file content (150 lines) | stat: -rw-r--r-- 4,572 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# encoding: ASCII-8BIT

require 'ole/types/property_set'

module Ole
	class Storage
		#
		# The MetaData class is designed to be high level interface to all the
		# underlying meta data stored within different sections, themselves within
		# different property set streams.
		#
		# With this class, you can simply get properties using their names, without
		# needing to know about the underlying guids, property ids etc.
		#
		# Example:
		#
		#   Ole::Storage.open('test.doc') { |ole| p ole.meta_data.doc_author }
		#
		# TODO:
		#
		# * add write support
		# * fix some of the missing type coercion (eg FileTime)
		# * maybe add back the ability to access individual property sets as a unit
		#   directly. ie <tt>ole.summary_information</tt>. Is this useful?
		# * full key support, for unknown keys, like
		#   <tt>ole.meta_data[myguid, myid]</tt>. probably needed for user-defined
		#   properties too.
		#
		class MetaData
			include Enumerable

			FILE_MAP = {
				Types::PropertySet::FMTID_SummaryInformation => "\005SummaryInformation",
				Types::PropertySet::FMTID_DocSummaryInfo => "\005DocumentSummaryInformation"
			}

			FORMAT_MAP = {
				'MSWordDoc' => :doc
			}

			CLSID_EXCEL97 = Types::Clsid.parse "{00020820-0000-0000-c000-000000000046}"
			CLSID_EXCEL95 = Types::Clsid.parse "{00020810-0000-0000-c000-000000000046}"
			CLSID_WORD97  = Types::Clsid.parse "{00020906-0000-0000-c000-000000000046}"
			CLSID_WORD95  = Types::Clsid.parse "{00020900-0000-0000-c000-000000000046}"

			CLSID_MAP = {
				CLSID_EXCEL97 => :xls,
				CLSID_EXCEL95 => :xls,
				CLSID_WORD97  => :doc,
				CLSID_WORD95  => :doc
			}

			MIME_TYPES = {
				:xls => 'application/vnd.ms-excel',
				:doc => 'application/msword',
				:ppt => 'application/vnd.ms-powerpoint',
				# not registered at IANA, but seems most common usage
				:msg => 'application/vnd.ms-outlook',
				# this is my default fallback option. also not registered at IANA.
				# file(1)'s default is application/msword, which is useless...
				nil  => 'application/x-ole-storage'
			}

			def initialize ole
				@ole = ole
			end

			# i'm thinking of making file_format and mime_type available through
			# #[], #each, and #to_h also, as calculated meta data (not assignable)

			def comp_obj
				return {} unless dirent = @ole.root["\001CompObj"]
				data = dirent.read
				# see - https://gnunet.org/svn/Extractor/doc/StarWrite_File_Format.html
				# compobj_version: 0x0001
				# byte_order: 0xffe
				# windows_version: 0x00000a03 (win31 apparently)
				# marker: 0xffffffff
				compobj_version, byte_order, windows_version, marker, clsid =
					data.unpack("vvVVa#{Types::Clsid::SIZE}")
				strings = []
				i = 28
				while i < data.length
					len = data[i, 4].unpack('V').first
					i += 4
					strings << data[i, len - 1]
					i += len
				end
				# in the unknown chunk, you usually see something like 'Word.Document.6'
				{:username => strings[0], :file_format => strings[1], :unknown => strings[2..-1]}
			end
			private :comp_obj

			def file_format
				comp_obj[:file_format]
			end

			def mime_type
				# based on the CompObj stream contents
				type = FORMAT_MAP[file_format]
				return MIME_TYPES[type] if type

				# based on the root clsid
				type = CLSID_MAP[Types::Clsid.load(@ole.root.clsid)]
				return MIME_TYPES[type] if type

				# fallback to heuristics
				has_file = Hash[*@ole.root.children.map { |d| [d.name.downcase, true] }.flatten]
				return MIME_TYPES[:msg] if has_file['__nameid_version1.0'] or has_file['__properties_version1.0']
				return MIME_TYPES[:doc] if has_file['worddocument'] or has_file['document']
				return MIME_TYPES[:xls] if has_file['workbook'] or has_file['book']

				MIME_TYPES[nil]
			end

			def [] key
				pair = Types::PropertySet::PROPERTY_MAP[key.to_s] or return nil
				file = FILE_MAP[pair.first] or return nil
				dirent = @ole.root[file] or return nil
				dirent.open { |io| return Types::PropertySet.new(io)[key] }
			end

			def []= key, value
				raise NotImplementedError, 'meta data writes not implemented'
			end

			def each(&block)
				FILE_MAP.values.each do |file|
					dirent = @ole.root[file] or next
					dirent.open { |io| Types::PropertySet.new(io).each(&block) }
				end
			end

			def to_h
				inject({}) { |hash, (name, value)| hash.update name.to_sym => value }
			end

			def method_missing name, *args, &block
				return super unless args.empty?
				pair = Types::PropertySet::PROPERTY_MAP[name.to_s] or return super
				self[name]
			end
		end

		def meta_data
			@meta_data ||= MetaData.new(self)
		end
	end
end