File: bytevector-string.scm

package info (click to toggle)
scheme48 1.9.2-1
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 18,232 kB
  • sloc: lisp: 88,907; ansic: 87,519; sh: 3,224; makefile: 771
file content (113 lines) | stat: -rw-r--r-- 3,934 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
; Part of Scheme 48 1.9.  See file COPYING for notices and license.

; Authors: Harald Glab-Phlak, Mike Sperber

(define (string->utf8 string)
  (enc:string->utf-8 string))

; If the bytevector begins with the three-byte sequence #xef #xbb
; #xbf, then those bytes are ignored.  (They are conventionally used
; as a signature to indicate UTF-8 encoding.  The string->utf8
; procedure does not emit those bytes, but UTF-8 encodings produced by
; other sources may contain them.)
  
(define (replacement-character) 
  (integer->char #xfffd))

(define (begins-with-utf8-bom? bv)
  (let* ((n (bytevector-length bv)))
    (and (<= 3 n)
	 (= #xef (bytevector-u8-ref bv 0))
	 (= #xbb (bytevector-u8-ref bv 1))
	 (= #xbf (bytevector-u8-ref bv 2)))))

(define (utf8->string bv)
  (if (begins-with-utf8-bom? bv)
      (let ((start 3)
	    (count (- (bytevector-length bv) 3)))
	(enc:utf-8->string-n bv start count (replacement-character)))
      (enc:utf-8->string bv (replacement-character))))

(define string->utf16 
  (opt-lambda (string (endness #f))
    (let ((text-codec
	   (case endness
	     ((#f big) utf-16be-codec)
	     ((little) utf-16le-codec)
	     (else (endianness-violation 'string->utf16 endness)))))
      (enc:string->bytes text-codec string))))

(define (maybe-utf16-bom bytevector n)
  (and (<= 2 n)
       (let ((b0 (bytevector-u8-ref bytevector 0))
	     (b1 (bytevector-u8-ref bytevector 1)))
	 (or (and (= b0 #xfe) (= b1 #xff) (endianness big))
	     (and (= b0 #xff) (= b1 #xfe) (endianness little))))))

(define utf16->string 
  (opt-lambda (bytevector endness (endianness-mandatory? #f))
    (let ((n (bytevector-length bytevector)))
      (call-with-values
	  (lambda ()
	    (cond
	     (endianness-mandatory? (values endness 0))
	     ((maybe-utf16-bom bytevector n) 
	      => (lambda (endness)
		   (values endness 2)))
	     (else (values endness 0))))
	(lambda (endness start)
	  (let ((text-codec (case endness
			      ((big) utf-16be-codec)
			      ((little) utf-16le-codec)
			      (else
			       (endianness-violation 'utf16->string endness))))
		(conv-len (- n start)))
	    (if (not (zero? (remainder n 2)))
		(assertion-violation 'utf16->string "Bytevector has bad length." bytevector))
	    (enc:bytes->string-n text-codec bytevector start conv-len (replacement-character))))))))

; There is no utf-32-codec, so we can't use textual i/o for this.

(define string->utf32 
  (opt-lambda (string (endness #f))
    (let ((text-codec (case endness
			((#f big) utf-32be-codec)
			((little) utf-32le-codec)
			(else (endianness-violation 'string->utf32 endness)))))
      (enc:string->bytes text-codec string))))

(define (maybe-utf32-bom bytevector n)
  (and (<= 4 n)
       (let ((b0 (bytevector-u8-ref bytevector 0))
	     (b1 (bytevector-u8-ref bytevector 1))
	     (b2 (bytevector-u8-ref bytevector 2))
	     (b3 (bytevector-u8-ref bytevector 3)))
	 (or (and (= b0 0) (= b1 0) (= b2 #xfe) (= b3 #xff)
		  (endianness big))
	     (and (= b0 #xff) (= b1 #xfe) (= b2 0) (= b3 0)
		  (endianness little))))))
  
(define utf32->string 
  (opt-lambda (bytevector endness (endianness-mandatory? #f))
    (let ((n (bytevector-length bytevector)))
      (call-with-values
	  (lambda ()
	    (cond
	     (endianness-mandatory? (values endness 0))
	     ((maybe-utf32-bom bytevector n)
	      => (lambda (endness)
		   (values endness 4)))
	     (else (values endness 0))))
	(lambda (endness start)
	  (let ((text-codec (case endness
			      ((big) utf-32be-codec)
			      ((little) utf-32le-codec)
			      (else
			       (endianness-violation 'utf32->string endness))))
		(conv-len (- n start)))
	    (if (not (zero? (remainder n 4)))
		(assertion-violation 'utf32->string "Bytevector has bad length." bytevector))
	    (enc:bytes->string-n text-codec bytevector start conv-len (replacement-character))))))))

(define (endianness-violation who what)
  (assertion-violation who "bad endianness" what))