File: wb_read_file.py

package info (click to toggle)
svn-workbench 1.6.8-2.1
  • links: PTS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 2,172 kB
  • ctags: 2,147
  • sloc: python: 15,925; sh: 108; makefile: 15; ansic: 9
file content (61 lines) | stat: -rw-r--r-- 1,815 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
'''
 ====================================================================
 Copyright (c) 2005-2010 Barry A Scott.  All rights reserved.

 This software is licensed as described in the file LICENSE.txt,
 which you should have received as part of this distribution.

 ====================================================================

    wb_read_file.py

'''
import locale
import codecs

import wb_platform_specific

def readFileContentsAsUnicode( filename ):
    '''
    Read the whole of the file named filename and return its contents
    converted to unicode by contentsAsUnicode().

    The file is opened with wb_platform_specific.uOpen() in mode 'r'.
    Errors raised while opening or reading the file are passed on
    to the caller.
    '''
    f = wb_platform_specific.uOpen( filename, 'r' )
    try:
        contents = f.read()

    finally:
        # always release the file handle, even when the read fails
        f.close()

    return contentsAsUnicode( contents )

def contentsAsUnicode( contents ):
    '''
    Decode the byte string contents and return the decoded text.

    The encoding is chosen by encodingFromContents(). Decoding is
    attempted in this order:

      1. strictly, using the chosen encoding
      2. using the chosen encoding, replacing the bytes in error
      3. as iso8859-1, replacing the bytes in error

    Step 3 is also used when there is no codec for the chosen
    encoding name (decode raises LookupError).
    '''
    encoding = encodingFromContents( contents )

    try:
        return contents.decode( encoding )

    except LookupError:
        # no codec is known by this name - go straight to latin-1
        pass

    except UnicodeDecodeError:
        try:
            # use the chosen encoding and replace chars in error
            return contents.decode( encoding, 'replace' )

        except UnicodeDecodeError:
            pass

    # fall back to latin-1
    return contents.decode( 'iso8859-1', 'replace' )

def encodingFromContents( contents ):
    '''
    Return the name of the encoding to use to decode the byte
    string contents.

    When contents starts with a byte order mark (BOM) the result is
    'utf-8', 'utf-16' or 'utf-32' as indicated by the BOM.

    Otherwise the encoding of the default locale is returned, with
    'mac-roman' mapped to 'utf-8'. When the locale does not provide
    an encoding, 'iso8859-1' is returned.
    '''
    # UTF-32 must be tested before UTF-16 because the UTF-32 LE BOM
    # (FF FE 00 00) starts with the UTF-16 LE BOM (FF FE).
    # The price is that UTF-16 LE text whose first character is
    # U+0000 is reported as utf-32.
    all_bom_encodings = (
        (codecs.BOM_UTF8, 'utf-8'),
        (codecs.BOM_UTF32_LE, 'utf-32'),
        (codecs.BOM_UTF32_BE, 'utf-32'),
        (codecs.BOM_UTF16_LE, 'utf-16'),
        (codecs.BOM_UTF16_BE, 'utf-16'),
        )

    for bom, bom_encoding in all_bom_encodings:
        # a slice of contents that is shorter than the BOM cannot match,
        # and contents that is only a BOM does match
        if contents[0:len(bom)] == bom:
            return bom_encoding

    try:
        encoding = locale.getdefaultlocale()[1]

    except ValueError:
        # the locale named by the environment could not be parsed
        encoding = None

    # Mac says mac-roman when utf-8 is what is required
    if encoding == 'mac-roman':
        encoding = 'utf-8'

    if encoding is None:
        encoding = 'iso8859-1'

    return encoding