1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177
|
/*
* govis: unicode aware vis(3) encoding implementation
* Copyright (C) 2017 SUSE LLC.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package govis
import (
"fmt"
"unicode"
)
func isunsafe(ch rune) bool {
return ch == '\b' || ch == '\007' || ch == '\r'
}
func isglob(ch rune) bool {
return ch == '*' || ch == '?' || ch == '[' || ch == '#'
}
// ishttp is defined by RFC 1808.
func ishttp(ch rune) bool {
// RFC1808 does not really consider characters outside of ASCII, so just to
// be safe always treat characters outside the ASCII character set as "not
// HTTP".
if ch > unicode.MaxASCII {
return false
}
return unicode.IsDigit(ch) || unicode.IsLetter(ch) ||
// Safe characters.
ch == '$' || ch == '-' || ch == '_' || ch == '.' || ch == '+' ||
// Extra characters.
ch == '!' || ch == '*' || ch == '\'' || ch == '(' ||
ch == ')' || ch == ','
}
func isgraph(ch rune) bool {
return unicode.IsGraphic(ch) && !unicode.IsSpace(ch) && ch <= unicode.MaxASCII
}
// vis converts a single *byte* into its encoding. While Go supports the
// concept of runes (and thus native utf-8 parsing), in order to make sure that
// the bit-stream will be completely maintained through an Unvis(Vis(...))
// round-trip. The downside is that Vis() will never output unicode -- but on
// the plus side this is actually a benefit on the encoding side (it will
// always work with the simple unvis(3) implementation). It also means that we
// don't have to worry about different multi-byte encodings.
func vis(b byte, flag VisFlag) (string, error) {
// Treat the single-byte character as a rune.
ch := rune(b)
// XXX: This is quite a horrible thing to support.
if flag&VisHTTPStyle == VisHTTPStyle {
if !ishttp(ch) {
return "%" + fmt.Sprintf("%.2X", ch), nil
}
}
// Figure out if the character doesn't need to be encoded. Effectively, we
// encode most "normal" (graphical) characters as themselves unless we have
// been specifically asked not to. Note though that we *ALWAYS* encode
// everything outside ASCII.
// TODO: Switch this to much more logical code.
if ch > unicode.MaxASCII {
/* ... */
} else if flag&VisGlob == VisGlob && isglob(ch) {
/* ... */
} else if isgraph(ch) ||
(flag&VisSpace != VisSpace && ch == ' ') ||
(flag&VisTab != VisTab && ch == '\t') ||
(flag&VisNewline != VisNewline && ch == '\n') ||
(flag&VisSafe != 0 && isunsafe(ch)) {
encoded := string(ch)
if ch == '\\' && flag&VisNoSlash == 0 {
encoded += "\\"
}
return encoded, nil
}
// Try to use C-style escapes first.
if flag&VisCStyle == VisCStyle {
switch ch {
case ' ':
return "\\s", nil
case '\n':
return "\\n", nil
case '\r':
return "\\r", nil
case '\b':
return "\\b", nil
case '\a':
return "\\a", nil
case '\v':
return "\\v", nil
case '\t':
return "\\t", nil
case '\f':
return "\\f", nil
case '\x00':
// Output octal just to be safe.
return "\\000", nil
}
}
// For graphical characters we generate octal output (and also if it's
// being forced by the caller's flags). Also spaces should always be
// encoded as octal.
if flag&VisOctal == VisOctal || isgraph(ch) || ch&0x7f == ' ' {
// Always output three-character octal just to be safe.
return fmt.Sprintf("\\%.3o", ch), nil
}
// Now we have to output meta or ctrl escapes. As far as I can tell, this
// is not actually defined by any standard -- so this logic is basically
// copied from the original vis(3) implementation. Hopefully nobody
// actually relies on this (octal and hex are better).
encoded := ""
if flag&VisNoSlash == 0 {
encoded += "\\"
}
// Meta characters have 0x80 set, but are otherwise identical to control
// characters.
if b&0x80 != 0 {
b &= 0x7f
encoded += "M"
}
if unicode.IsControl(rune(b)) {
encoded += "^"
if b == 0x7f {
encoded += "?"
} else {
encoded += fmt.Sprintf("%c", b+'@')
}
} else {
encoded += fmt.Sprintf("-%c", b)
}
return encoded, nil
}
// Vis encodes the provided string to a BSD-compatible encoding using BSD's
// vis() flags. However, it will correctly handle multi-byte encoding (which is
// not done properly by BSD's vis implementation).
func Vis(src string, flag VisFlag) (string, error) {
if flag&visMask != flag {
return "", fmt.Errorf("vis: flag %q contains unknown or unsupported flags", flag)
}
output := ""
for _, ch := range []byte(src) {
encodedCh, err := vis(ch, flag)
if err != nil {
return "", err
}
output += encodedCh
}
return output, nil
}
|