1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
|
'''
====================================================================
Copyright (c) 2005-2006 Barry A Scott. All rights reserved.
This software is licensed as described in the file LICENSE.txt,
which you should have received as part of this distribution.
====================================================================
wb_read_file.py
'''
import locale
import codecs
def readFileContentsAsUnicode( filename ):
f = file( filename, 'r' )
contents = f.read()
return contentsAsUnicode( contents )
def contentsAsUnicode( contents ):
encoding = encodingFromContents( contents )
try:
return contents.decode( encoding )
except UnicodeDecodeError:
# fall back to latin-1
try:
return contents.decode( 'iso8859-1' )
except UnicodeDecodeError:
# sigh this is hard. use the choosen encoding and replace chars in error
return contents.decode( encoding, 'replace' )
def encodingFromContents( contents ):
if( len(contents) > len(codecs.BOM_UTF8)
and contents[0:len(codecs.BOM_UTF8)] == codecs.BOM_UTF8 ):
encoding = 'utf-8'
elif( len(contents) > len(codecs.BOM_UTF16_LE)
and contents[0:len(codecs.BOM_UTF16_LE)] in [codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE]):
encoding = 'utf-16'
elif( len(contents) > len(codecs.BOM_UTF32_LE)
and contents[0:len(codecs.BOM_UTF32_LE)] in [codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE]):
encoding = 'utf-32'
else:
encoding = locale.getdefaultlocale()[1]
if encoding is None:
encoding = 'iso8859-1'
return encoding
|