1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403
|
# -*- coding: utf-8 -*-
## Python bindings for GNU libextractor
##
## Copyright (C) 2006 Bader Ladjemi <bader@tele2.fr>
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program; see the file COPYING. If not, write to the
## Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
## USA.
##
"""
Python bindings for GNU libextractor
libextractor is a simple library for keyword extraction. libextractor
does not support all formats but supports a simple plugging mechanism
such that you can quickly add extractors for additional formats, even
without recompiling libextractor. libextractor typically ships with a
dozen helper-libraries that can be used to obtain keywords from common
file-types.
libextractor is a part of the GNU project (http://www.gnu.org/).
"""
from ctypes import *
# Bind the libextractor shared object: try the versioned soname first and,
# if that cannot be loaded, fall back to attribute-style lookup of
# "extractor" on ctypes' cdll loader.
try:
    libextractor = CDLL('libextractor.so.1')
except OSError:
    libextractor = cdll.extractor

__all__ = [
    'Extractor',
    'isBinaryType',
    'EXTRACTOR_ENCODING',
    'DEFAULT_LIBRARIES',
    'EXTRACTOR_THUMBNAIL_DATA',
]
__version__ = "0.5"
__licence__ = "GNU GPL"

# Charset used to decode the keywords handed back by libextractor.
EXTRACTOR_ENCODING = "utf-8"
# Keyword types are exchanged with the C library as plain ints.
KeywordType = c_int


class Keywords(Structure):
    """
    EXTRACTOR_Keywords struct: one node of a singly linked keyword list.

    Fields:
      - keyword: the keyword value (C string)
      - keywordType: numeric keyword type
      - next: pointer to the following node (NULL on the last node)
    """
    # The structure refers to itself through ``next``, so the fields are
    # assigned after the class statement (ctypes "incomplete type" pattern).
    pass


# Pointer-to-Keywords type; both historical aliases are kept for callers.
Keywords_p = POINTER(Keywords)
Keywords._fields_ = [('keyword', c_char_p),
                     ('keywordType', KeywordType),
                     ('next', Keywords_p)]
KEYWORDS = Keywords_p
# ctypes assumes a C int return value unless told otherwise, so declare the
# functions that hand back a keyword list or a C string.
# NOTE(review): the library-list functions used by Extractor (e.g.
# EXTRACTOR_loadDefaultLibraries) keep the default int restype -- confirm
# that this is safe on 64-bit platforms.
for _function_name in ('EXTRACTOR_getKeywords',
                       'EXTRACTOR_getKeywords2',
                       'EXTRACTOR_removeDuplicateKeywords'):
    getattr(libextractor, _function_name).restype = KEYWORDS
for _function_name in ('EXTRACTOR_getKeywordTypeAsString',
                       'EXTRACTOR_getDefaultLibraries'):
    getattr(libextractor, _function_name).restype = c_char_p
del _function_name

# thumbnail keyword type (binary)
EXTRACTOR_THUMBNAIL_DATA = 70


def isBinaryType(keyword_type):
    """
    Tell whether keywords of the given type hold binary data.

    Only the thumbnail type (EXTRACTOR_THUMBNAIL_DATA) is treated as binary.

    @param keyword_type: keyword type (int)
    @return: True if keyword_type is the thumbnail type, False otherwise
    """
    is_thumbnail = (keyword_type == EXTRACTOR_THUMBNAIL_DATA)
    return is_thumbnail


# default loaded libraries
DEFAULT_LIBRARIES = libextractor.EXTRACTOR_getDefaultLibraries().split(':')  # colon-separated string from the C library, split into a list of names
class Extractor(object):
    """
    Main class for extracting meta-data with GNU libextractor.

    You may create multiple instances of Extractor to use
    different sets of library. Unless defaults=False is given, each
    Extractor starts with the default set of libraries.

    Use the extract method to obtain keywords from a file.

    Use the add and remove libraries methods to change the list of
    libraries that should be used.
    """

    def __init__(self, defaults=True, libraries=None, lang=None, languages=None,
                 hash=None, use_filename=False, split_keywords=False):
        """
        Initialize Extractor's instance

        @param defaults: load default plugins
        @param libraries: colon-separated string of library names to load
        @param lang: use the generic plaintext extractor for the language with the 2-letter language code LANG
        @param languages: list of lang
        @param hash: compute hash using the given algorithm (currently 'sha1' or 'md5')
        @param use_filename: use the filename as a keyword (add filename-extractor library)
        @param split_keywords: use keyword splitting (add split-extractor library)

        >>> Extractor() #doctest: +ELLIPSIS
        <__main__.Extractor object at 0x...>

        >>> extractor = Extractor(defaults=False)
        >>> extractor.libraries
        ()

        >>> extractor = Extractor()
        >>> sorted(extractor.libraries) == sorted(tuple(DEFAULT_LIBRARIES))
        True

        >>> extractor = Extractor(hash='md5')
        >>> found = False
        >>> for library in extractor.libraries:
        ...     if 'md5' in library:
        ...         found = True
        ...         break
        >>> found
        True

        >>> extractor = Extractor(use_filename=True)
        >>> found = False
        >>> for library in extractor.libraries:
        ...     if 'filename' in library:
        ...         found = True
        ...         break
        >>> found
        True

        >>> extractor = Extractor(split_keywords=True)
        >>> found = False
        >>> for library in extractor.libraries:
        ...     if 'split' in library:
        ...         found = True
        ...         break
        >>> found
        True
        """
        # Names of the loaded libraries; only the keys carry information.
        self._libraries = {}
        # Value returned by the C library for the current library list
        # (None while nothing is loaded).
        self.extractors = None

        if defaults:
            self.extractors = libextractor.EXTRACTOR_loadDefaultLibraries()
            self._libraries = dict.fromkeys(DEFAULT_LIBRARIES)
        if use_filename:
            self.addLibrary("libextractor_filename")
        if libraries:
            self.extractors = libextractor.EXTRACTOR_loadConfigLibraries(self.extractors, libraries)
            self._libraries.update(dict.fromkeys(libraries.split(':')))
        if isinstance(lang, str):
            self.addLibraryLast("libextractor_printable_%s" % lang)
        if isinstance(hash, str):
            self.addLibraryLast("libextractor_hash_%s" % hash)
        if languages:
            for language in languages:
                self.addLibraryLast("libextractor_printable_%s" % language)
        if split_keywords:
            self.addLibraryLast("libextractor_split")

    def extract(self, filename=None, data=None, size=None):
        """Extract keywords from a file, or from its data.

        @param filename: filename string
        @param data: data contents
        @param size: data size

        This function returns a list of tuples. Its first value is keyword type
        and its second value is keyword value. If the file cannot be opened
        or cannot be found, the list will be empty. The list can
        also be empty if no keyword was found for the file.

        If you give data, size has to be given too. When neither a filename
        nor both data and size are supplied, None is returned.
        """
        # A filename takes precedence over in-memory data.
        if filename:
            return self.extractFromFile(filename)
        if data and size:
            return self.extractFromData(data, size)
        return None

    def extractFromFile(self, filename):
        """Extract keywords from a file using its filename.

        @param filename: filename string

        This function returns a list of tuples. Its first value is keyword type
        and its second value is keyword value. If the file cannot be opened
        or cannot be found, the list will be empty. The list can
        also be empty if no keyword was found for the file.

        >>> import os
        >>> extractor = Extractor()
        >>> filename = os.tmpnam()
        >>> f = file(filename, 'w')
        >>> extractor.extract(filename)
        []

        >>> import os
        >>> extractor = Extractor()
        >>> filename = '../Extractor/test/test.png'
        >>> extractor.extract(filename)
        [(u'comment', u'Testing keyword extraction\\n'), (u'resource-identifier', u'dc6c58c971715e8043baef058b675eec'), (u'size', u'4x4'), (u'mimetype', u'image/png')]

        >>> import os, glob
        >>> extractor = Extractor()
        >>> filename = glob.glob('dist/*.gz')[0]
        >>> extracted = extractor.extract(filename)
        >>> filename_count = 0
        >>> for keyword_type, keyword in extracted:
        ...     if keyword_type == 'filename':
        ...         filename_count += 1
        >>> filename_count > 1
        True
        """
        self.keywords_p = libextractor.EXTRACTOR_getKeywords(self.extractors, filename)
        return self._extract()

    def extractFromData(self, data, size):
        """Extract keywords using its data.

        @param data: data contents
        @param size: data size

        This function returns a list of tuples. Its first value is keyword type
        and its second value is keyword value. The list is empty if no
        keyword was found in the data.
        """
        self.keywords_p = libextractor.EXTRACTOR_getKeywords2(self.extractors, data, size)
        return self._extract()

    def _extract(self):
        """
        Convert the native keyword list in self.keywords_p to Python values.

        Walks the linked list, decodes every non-binary keyword with
        EXTRACTOR_ENCODING, then releases the native list. The result is
        also stored in self.extracted.

        @return: list of (keyword type name, keyword value) tuples
        """
        self.extracted = []
        if not self.keywords_p:
            # NULL list: nothing was extracted and there is nothing to free.
            return self.extracted
        try:
            node_p = self.keywords_p
            while node_p:
                self.keywords = node_p.contents
                keyword_type = libextractor.EXTRACTOR_getKeywordTypeAsString(
                    self.keywords.keywordType).decode(EXTRACTOR_ENCODING)
                keyword = self.keywords.keyword
                # Binary payloads (thumbnails) are handed back undecoded.
                if not isBinaryType(self.keywords.keywordType):
                    keyword = keyword.decode(EXTRACTOR_ENCODING)
                self.extracted.append((keyword_type, keyword))
                node_p = self.keywords.next
        finally:
            # Release the native list even when decoding fails, and drop
            # the reference into the memory that has just been freed.
            libextractor.EXTRACTOR_freeKeywords(self.keywords_p)
            self.keywords_p = None
            self.keywords = None
        return self.extracted

    def addLibrary(self, library):
        """
        Add given library to the extractor. Invoke with a string with the name
        of the library that should be added. For example,

        'libextractor_filename'

        will prepend the extractor that just adds the filename as a
        keyword.

        No errors are reported if the library is not
        found.

        @param library: library's name
        """
        self._libraries[library] = None
        self.extractors = libextractor.EXTRACTOR_addLibrary(self.extractors, library)

    def addLibraryLast(self, library):
        """
        Same as addLibrary but the library is added at the last.

        @param library: library's name
        """
        self._libraries[library] = None
        self.extractors = libextractor.EXTRACTOR_addLibraryLast(self.extractors, library)

    def removeLibrary(self, library):
        """
        Remove a library. Pass the name of the library that is to
        be removed. Only one library can be removed at a time.
        For example,

        'libextractor_pdf'

        removes the PDF extractor (if added).
        ValueError will be thrown if no library match.

        @param library: library's name
        @raise ValueError: if the library is not currently loaded
        """
        try:
            del self._libraries[library]
        except KeyError:
            # Call form of raise: valid on both Python 2 and Python 3.
            raise ValueError("No such loaded library")
        self.extractors = libextractor.EXTRACTOR_removeLibrary(self.extractors, library)

    def addLibraries(self, libraries):
        """
        Add given libraries.
        Same as addLibrary but libraries is a list of library's names.
        Entries that are not str instances are skipped.

        @param libraries: list of libraries names
        """
        for library in libraries:
            if isinstance(library, str):
                self.addLibrary(library)

    def removeAllLibraries(self):
        """
        Remove all libraries.

        >>> extractor = Extractor()
        >>> extractor.removeAllLibraries()
        >>> extractor.libraries
        ()
        """
        self._libraries = {}
        if self.extractors:
            libextractor.EXTRACTOR_removeAll(self.extractors)
            self.extractors = None

    def keywordTypes(self):
        """
        Returns a tuple of all keywords types.

        Types are queried by increasing index until the library returns
        an empty (false) name.

        @return: tuple of all keywords types

        >>> extractor = Extractor()
        >>> extractor.keywordTypes()
        ('unknown', 'filename', 'mimetype', 'title', 'author', 'artist', 'description', 'comment', 'date', 'publisher', 'language', 'album', 'genre', 'location', 'version', 'organization', 'copyright', 'subject', 'keywords', 'contributor', 'resource-type', 'format', 'resource-identifier', 'source', 'relation', 'coverage', 'software', 'disclaimer', 'warning', 'translated', 'creation date', 'modification date', 'creator', 'producer', 'page count', 'page orientation', 'paper size', 'used fonts', 'page order', 'created for', 'magnification', 'release', 'group', 'size', 'summary', 'packager', 'vendor', 'license', 'distribution', 'build-host', 'os', 'dependency', 'MD4', 'MD5', 'SHA-0', 'SHA-1', 'RipeMD160', 'resolution', 'category', 'book title', 'priority', 'conflicts', 'replaces', 'provides', 'conductor', 'interpreter', 'owner', 'lyrics', 'media type', 'contact', 'binary thumbnail data', 'publication date', 'camera make', 'camera model', 'exposure', 'aperture', 'exposure bias', 'flash', 'flash bias', 'focal length', 'focal length (35mm equivalent)', 'iso speed', 'exposure mode', 'metering mode', 'macro mode', 'image quality', 'white balance', 'orientation')
        """
        i = 0
        keyword_types = []
        while True:
            keyword_type = libextractor.EXTRACTOR_getKeywordTypeAsString(i)
            if not keyword_type:
                break
            keyword_types.append(keyword_type)
            i += 1
        return tuple(keyword_types)

    def _get_libraries(self):
        """
        Return current libraries

        @return: tuple of the names of the current libraries
        """
        return tuple(self._libraries.keys())

    def _set_libraries(self, libraries):
        """
        Add libraries to load (don't replace current ones)

        @param libraries: list of libraries

        >>> extractor = Extractor()
        >>> extractor.libraries = ('libextractor_filename', )
        >>> 'libextractor_filename' in extractor.libraries
        True
        >>> len(extractor.libraries) == len(DEFAULT_LIBRARIES)+1
        True
        """
        self.addLibraries(libraries)

    libraries = property(fget=_get_libraries, fset=_set_libraries,
                         fdel=removeAllLibraries, doc='tuple of loaded libraries')

    def __del__(self):
        """
        Unload any libraries still held when the instance is collected.

        >>> extractor = Extractor()
        >>> del extractor
        """
        # getattr: __init__ may have failed before the attribute was set.
        if getattr(self, 'extractors', None):
            self.removeAllLibraries()
if __name__ == "__main__":
    # Executed as a script: run the doctests embedded in the docstrings.
    from doctest import testmod
    testmod()
|