Added a constructor function for codec-character-streams.
[lisp-utils.git] / charcode.lisp
... / ...
CommitLineData
1;;;; CHAR-CODE -- Conversions between characters and byte
2;;;; representations thereof
3
;;; Package definition.  The Gray-stream classes and generic functions
;;; are taken from SB-GRAY on SBCL and from a package named GRAY on
;;; every other implementation.
(defpackage :charcode
  (:use :cl #+sbcl :sb-gray #-sbcl :gray)
  (:export "MAKE-ENCODER" "MAKE-DECODER" "ENCODE-STRING" "DECODE-STRING"
           "CODING-ERROR"
           "MAKE-CODEC-CHARACTER-STREAM"
           ;; Codec names.  ISO-8859-1 is registered as a synonym of
           ;; LATIN-1 in this file, so it is exported alongside the
           ;; other codec names instead of staying internal.
           "LATIN-1" "LATIN1" "ISO-8859-1" "UTF-8" "UTF8"))
(in-package :charcode)
11
12;;; General stuff
13
(define-condition coding-error (error)
  ((input    :initarg :input)
   (position :initarg :position)
   (result   :initarg :result))
  (:documentation
   "Base condition for conversion failures.  Carries the sequence that
was being converted (INPUT), an index into it (POSITION) and the output
sequence as filled so far (RESULT)."))

(define-condition simple-coding-error (coding-error simple-error)
  ()
  (:documentation
   "A CODING-ERROR whose report comes from a format control string and
its arguments, as with any SIMPLE-ERROR."))
20
(defun coding-error (input position result format &rest format-args)
  "Signal a SIMPLE-CODING-ERROR.
INPUT, POSITION and RESULT are stored in the condition; FORMAT and
FORMAT-ARGS become its format control and format arguments."
  (error (make-condition 'simple-coding-error
                         :format-control format
                         :format-arguments format-args
                         :input input
                         :position position
                         :result result)))
25
;;; Function types of the two kinds of converters.  Both take a source
;;; sequence, a destination sequence and optional START/END bounds, and
;;; return a boolean.

(deftype decoder-fun ()
  '(function ((array (unsigned-byte 8))
              (array character)
              &key (start fixnum) (end fixnum))
    boolean))

(deftype encoder-fun ()
  '(function ((array character)
              (array (unsigned-byte 8))
              &key (start fixnum) (end fixnum))
    boolean))
34
(defmacro define-encoder ((name) &body make-encoder)
  "Register an encoder constructor for the codec NAME.
The MAKE-ENCODER forms become the body of a zero-argument function that
is stored on NAME's property list under the MAKE-ENCODER indicator."
  (let ((constructor `(lambda () ,@make-encoder)))
    `(setf (get ',name 'make-encoder) (function ,constructor))))
37
(defmacro define-decoder ((name) &body make-decoder)
  "Register a decoder constructor for the codec NAME.
The MAKE-DECODER forms become the body of a zero-argument function that
is stored on NAME's property list under the MAKE-DECODER indicator."
  (let ((constructor `(lambda () ,@make-decoder)))
    `(setf (get ',name 'make-decoder) (function ,constructor))))
40
(defmacro define-codec-synonyms (name &rest synonyms)
  "Make every symbol in SYNONYMS name the same codec as NAME.
At load or execution time the MAKE-ENCODER and MAKE-DECODER properties
of NAME are copied onto each synonym."
  (let ((copies (loop for alias in synonyms
                      collect `(setf (get ',alias 'make-encoder) (get ',name 'make-encoder)
                                     (get ',alias 'make-decoder) (get ',name 'make-decoder)))))
    `(eval-when (:load-toplevel :execute)
       ,@copies)))
47
(defun make-encoder (name)
  "Return a new encoder function for the codec named by the symbol NAME.
The constructor is looked up under the MAKE-ENCODER property of NAME."
  (let ((constructor (get name 'make-encoder)))
    (the encoder-fun (values (funcall constructor)))))
50
(defun make-decoder (name)
  "Return a new decoder function for the codec named by the symbol NAME.
The constructor is looked up under the MAKE-DECODER property of NAME."
  (let ((constructor (get name 'make-decoder)))
    (the decoder-fun (values (funcall constructor)))))
53
(defun encode-string (string coding)
  "Encode STRING with the codec CODING and return the octets.
The result is an adjustable (UNSIGNED-BYTE 8) vector with a fill
pointer.  A coding error is signalled when the encoder returns false."
  (declare (type string string))
  (let ((encoder (make-encoder coding))
        (octets (make-array (list (length string))
                            :element-type '(unsigned-byte 8)
                            :adjustable t
                            :fill-pointer 0)))
    (when (null (funcall encoder string octets))
      (coding-error string (length string) octets
                    "Encoding of string in ~A ended prematurely." coding))
    octets))
61
(defun decode-string (buffer coding)
  "Decode the octet array BUFFER with the codec CODING.
The result is an adjustable character vector with a fill pointer.  A
coding error is signalled when the decoder returns false."
  (declare (type (array (unsigned-byte 8)) buffer))
  (let ((decoder (make-decoder coding))
        (chars (make-array (list (length buffer))
                           :element-type 'character
                           :adjustable t
                           :fill-pointer 0)))
    (when (null (funcall decoder buffer chars))
      (coding-error buffer (length buffer) chars
                    "~A byte sequence ended prematurely." coding))
    chars))
69
70;;; Gray stream implementation
71
(defclass codec-character-stream (fundamental-character-input-stream
                                  fundamental-character-output-stream)
  (;; Decoder and encoder functions used for input and output.
   (decoder :initarg :decoder)
   (encoder :initarg :encoder)
   ;; The underlying stream that octets are read from and written to.
   (back :initarg :back)
   ;; Index into BUFFER of the next character to hand out.
   (read-pos :initform 0)
   ;; Characters decoded from BACK that have not been discarded yet.
   (buffer :initform (make-array 64
                                 :element-type 'character
                                 :adjustable t
                                 :fill-pointer 0)))
  (:documentation
   "Bidirectional character stream that converts to and from octets on
an underlying stream using a decoder/encoder pair."))
78
(defun make-codec-character-stream (real-stream charset)
  "Wrap REAL-STREAM in a CODEC-CHARACTER-STREAM using the codec CHARSET.
A fresh decoder and a fresh encoder are created for the new stream."
  (declare (type stream real-stream))
  (let* ((decoder (make-decoder charset))
         (encoder (make-encoder charset)))
    (make-instance 'codec-character-stream
                   :back real-stream
                   :decoder decoder
                   :encoder encoder)))
82
;;; Closing the codec stream closes the underlying stream first,
;;; forwarding ABORT, and then runs the next CLOSE method.
(defmethod close ((stream codec-character-stream) &key abort)
  (close (slot-value stream 'back) :abort abort)
  (call-next-method))
87
;;; The codec stream is open exactly when its underlying stream is.
;;; The query must go to BACK: asking STREAM itself would re-enter this
;;; very method and recurse without end.
(defmethod open-stream-p ((stream codec-character-stream))
  (with-slots (back) stream
    (open-stream-p back)))
91
(defun ccs-ensure-buffer (stream len)
  "Try to have at least LEN unread characters in STREAM's buffer.
Octets are read from the underlying stream, up to LEN at a time, and
decoded onto the end of the buffer until enough characters are present.
Returns T on success and NIL when a read yields no octets first."
  (declare (type codec-character-stream stream)
           (type integer len))
  (with-slots (decoder back buffer read-pos) stream
    (let ((octets (make-array (list len) :element-type '(unsigned-byte 8))))
      (do ()
          ((>= (- (length buffer) read-pos) len) t)
        (let ((count (read-sequence octets back)))
          (when (zerop count)
            (return-from ccs-ensure-buffer nil))
          (funcall decoder octets buffer :end count))))))
102
(defun ccs-clear-buffer (stream)
  "Discard the characters of STREAM's buffer that were already read.
The unread tail is moved to the front of the buffer, the fill pointer
is shortened accordingly and the read position is reset to 0."
  (declare (type codec-character-stream stream))
  (with-slots (read-pos buffer) stream
    (replace buffer buffer :start2 read-pos)
    (decf (fill-pointer buffer) read-pos)
    (setf read-pos 0)))
109
;;; Return the next decoded character, or :EOF when none can be made
;;; available.  Once the read position reaches 16 the consumed prefix
;;; of the buffer is discarded.
(defmethod stream-read-char ((stream codec-character-stream))
  (if (ccs-ensure-buffer stream 1)
      (with-slots (read-pos buffer) stream
        (let ((next (aref buffer read-pos)))
          (incf read-pos)
          (when (>= read-pos 16)
            (ccs-clear-buffer stream))
          next))
      :eof))
117
;;; Push CHAR back so that it is the next character read.  Returns NIL.
;;;
;;; When nothing has been consumed from the buffer (READ-POS is 0) there
;;; is no room in front of the unread characters, so the contents are
;;; first shifted 16 places towards the end, growing the array if needed.
;;; The read position is then moved back by one *before* the character
;;; is stored, so that the still-unread character at the old read
;;; position is left untouched and the index never becomes negative.
(defmethod stream-unread-char ((stream codec-character-stream) char)
  (with-slots (read-pos buffer) stream
    (when (= read-pos 0)
      (let* ((len (length buffer))
             (new-len (+ len 16)))
        ;; Grow the storage first; the fill pointer may not exceed the
        ;; array dimension.
        (when (< (array-dimension buffer 0) new-len)
          (setf buffer (adjust-array buffer (list new-len))))
        (setf (fill-pointer buffer) new-len)
        ;; REPLACE on one and the same sequence behaves as if the source
        ;; region were copied aside first, so the overlap is harmless.
        (replace buffer buffer :start1 16 :start2 0 :end2 len)
        (setf read-pos 16)))
    (decf read-pos)
    (setf (aref buffer read-pos) char)
    nil))
129
(defun ccs-wont-hang-p (stream)
  "True when STREAM has buffered input or its underlying stream has some.
Yields the next buffered character if there is one, and otherwise the
result of LISTEN on the underlying stream."
  (declare (type codec-character-stream stream))
  (with-slots (read-pos back buffer) stream
    (cond ((and (< read-pos (length buffer))
                (aref buffer read-pos)))
          (t (listen back)))))
135
;;; Read a character only when CCS-WONT-HANG-P reports that input is
;;; available; otherwise return NIL.
(defmethod stream-read-char-no-hang ((stream codec-character-stream))
  (when (ccs-wont-hang-p stream)
    (stream-read-char stream)))
140
;;; Return the next character without consuming it, or :EOF when no
;;; character can be made available.
(defmethod stream-peek-char ((stream codec-character-stream))
  (if (ccs-ensure-buffer stream 1)
      (with-slots (read-pos buffer) stream
        (aref buffer read-pos))
      :eof))
146
;;; Return the next character when input is available, and NIL when
;;; there is none or the stream is at end of file.
(defmethod stream-listen ((stream codec-character-stream))
  (when (ccs-wont-hang-p stream)
    (let ((next (stream-peek-char stream)))
      (unless (eq next :eof)
        next))))
154
;;; Encode CHAR and write the resulting octets to the underlying stream.
;;; Returns CHAR, as the Gray-stream protocol specifies for
;;; STREAM-WRITE-CHAR, rather than the temporary octet buffer.
(defmethod stream-write-char ((stream codec-character-stream) char)
  (with-slots (encoder back) stream
    (let ((seq (make-array '(1) :element-type 'character :initial-element char))
          ;; Adjustable with a fill pointer: the encoder appends octets
          ;; with VECTOR-PUSH-EXTEND.
          (outbuf (make-array '(16) :element-type '(unsigned-byte 8)
                                    :adjustable t :fill-pointer 0)))
      (funcall encoder seq outbuf)
      (write-sequence outbuf back)))
  char)
161
;;; FINISH-OUTPUT is delegated to the underlying stream.
(defmethod stream-finish-output ((stream codec-character-stream))
  (with-slots (back) stream
    (finish-output back)))
164
;;; FORCE-OUTPUT is delegated to the underlying stream.
(defmethod stream-force-output ((stream codec-character-stream))
  (with-slots (back) stream
    (force-output back)))
167
;;; Fill SEQ between START and END with decoded characters.
;;;
;;; Returns the index of the first element of SEQ that was not updated,
;;; which is what READ-SEQUENCE hands back to its caller; fewer than
;;; END - START characters are stored when the input runs out.  START
;;; and END may be passed as NIL and then mean 0 and the length of SEQ.
(defmethod stream-read-sequence ((stream codec-character-stream) seq &optional (start 0) (end (length seq)))
  (let* ((start (or start 0))
         (end (or end (length seq)))
         (wanted (- end start)))
    ;; The result is deliberately ignored: on end of file whatever is
    ;; already buffered is still delivered.
    (ccs-ensure-buffer stream wanted)
    (with-slots (read-pos buffer) stream
      (let ((copied (min wanted (- (length buffer) read-pos))))
        (replace seq buffer :start1 start :end1 end
                            :start2 read-pos :end2 (length buffer))
        ;; Drop the consumed prefix once it has grown to 128 characters.
        (when (>= (incf read-pos copied) 128)
          (ccs-clear-buffer stream))
        (+ start copied)))))
175
;;; Encode the characters of SEQ between START and END and write the
;;; octets to the underlying stream.  Returns SEQ.
;;;
;;; START and END are forwarded to the encoder so that only the
;;; requested part of SEQ is written.  They may be passed as NIL and
;;; then mean 0 and the length of SEQ.
(defmethod stream-write-sequence ((stream codec-character-stream) seq &optional (start 0) (end (length seq)))
  (let ((start (or start 0))
        (end (or end (length seq))))
    (with-slots (encoder back) stream
      (let ((outbuf (make-array (list (- end start))
                                :element-type '(unsigned-byte 8)
                                :adjustable t :fill-pointer 0)))
        (funcall encoder seq outbuf :start start :end end)
        (write-sequence outbuf back)))
    seq))
181
182;;; Implementation-specific functions
183
#+(or sbcl (and clisp unicode))
(defun unicode->char (unicode)
  "Return the character whose code is the integer UNICODE (via CODE-CHAR)."
  (declare (type (unsigned-byte 24) unicode))
  (values (code-char unicode)))
188
#+(or sbcl (and clisp unicode))
(defun char->unicode (char)
  "Return the integer code of the character CHAR (via CHAR-CODE)."
  (declare (type character char))
  (values (char-code char)))
193
194;;; Latin-1
195
(defun decode-latin-1 (byteseq charseq &key (start 0) (end (length byteseq)))
  "Append the characters for the octets BYTESEQ[START, END) to CHARSEQ.
Each octet is turned into the character with that code.  Always
returns T."
  (declare (type (array (unsigned-byte 8)) byteseq)
           (type (array character) charseq)
           (type fixnum start end))
  (loop for index from start below end
        do (vector-push-extend (unicode->char (aref byteseq index)) charseq))
  t)
204
(defun encode-latin-1 (charseq byteseq &key (start 0) (end (length charseq)))
  "Append the Latin-1 octets for CHARSEQ[START, END) to BYTESEQ.
Returns T once every character has been processed.  A character whose
code is above 255 signals a coding error with two restarts in effect:
:REPLACE-CHAR appends the code of a replacement character (default
#\\?) and :SKIP-CHAR drops the character.  Encoding then resumes with
the character after the offending one."
  (declare (type (array (unsigned-byte 8)) byteseq)
           (type (array character) charseq)
           (type fixnum start end))
  ;; The outer loop re-enters the RESTART-CASE after a restart has been
  ;; invoked; START has already been moved past the offending character.
  (loop
    (restart-case
        (loop
          (unless (< start end) (return-from encode-latin-1 t))
          (let ((cp (char->unicode (aref charseq start))))
            (incf start)
            (unless (< cp 256)
              ;; 255 is the largest encodable code point; the message
              ;; now states the same limit the test enforces.
              (coding-error charseq start byteseq "ISO-8859-1 cannot encode code-points higher than 255."))
            (vector-push-extend cp byteseq)))
      (:replace-char (&optional (replacement #\?))
        :report "Replace this character with another."
        (vector-push-extend (char->unicode replacement) byteseq))
      (:skip-char ()
        :report "Ignore this character."
        nil))))
224
;;; Register the Latin-1 codec.  The converters keep no state, so each
;;; constructor simply returns the corresponding global function.
(define-decoder (latin-1) (function decode-latin-1))
(define-encoder (latin-1) (function encode-latin-1))

;;; LATIN1 and ISO-8859-1 name the same codec.
(define-codec-synonyms latin-1
  latin1
  iso-8859-1)
232
233;;; UTF-8
234
(defun encode-utf-8 (charseq byteseq &key (start 0) (end (length charseq)))
  "Append the UTF-8 octets for CHARSEQ[START, END) to BYTESEQ.
Codes below 128 are written as a single octet.  Larger codes are split
into 6-bit continuation octets, low bits first, until the remainder
fits into the lead octet.  Always returns T."
  (declare (type (array (unsigned-byte 8)) byteseq)
           (type (array character) charseq)
           (type fixnum start end))
  (loop for index from start below end
        do (let ((code (char->unicode (aref charseq index))))
             (if (< code 128)
                 (vector-push-extend code byteseq)
                 (let ((octets '())
                       (continuations 0))
                   ;; Octets are pushed last-first, so the finished
                   ;; list is already in output order.
                   (loop
                     (push (logior #x80 (ldb (byte 6 0) code)) octets)
                     (setf code (truncate code 64))
                     (incf continuations)
                     (when (< code (expt 2 (- 6 continuations)))
                       ;; Lead octet: a run of high one-bits followed
                       ;; by the remaining bits of the code.
                       (let ((marker (logand #xff
                                             (lognot (1- (expt 2 (- 7 continuations)))))))
                         (push (logior marker code) octets))
                       (return)))
                   (dolist (octet octets)
                     (vector-push-extend octet byteseq))))))
  t)
258
;;; Register the UTF-8 encoder.  It keeps no state, so the constructor
;;; simply returns the global function.
(define-encoder (utf-8) (function encode-utf-8))
261
;;; UTF-8 decoder constructor.  Every call of the constructor creates a
;;; fresh pair of state variables shared by the returned DECODE closure:
;;;   MLEN -- number of continuation bytes still expected (0 = idle),
;;;   MBUF -- code-point bits collected so far for the pending sequence.
;;; Because the state lives outside DECODE, a multibyte sequence may be
;;; split across successive calls of the same decoder.
;;;
;;; DECODE appends the characters for BYTESEQ[START, END) to CHARSEQ and
;;; returns true iff no multibyte sequence is left unfinished.  Invalid
;;; input signals a SIMPLE-CODING-ERROR with the restarts :REPLACE-CHAR
;;; and :SKIP-CHAR in effect.
262(define-decoder (utf-8)
263 (let ((mbuf 0)
264 (mlen 0))
265 (flet ((decode (byteseq charseq &key (start 0) (end (length byteseq)))
266 (declare (type (array (unsigned-byte 8)) byteseq)
267 (type (array character) charseq)
268 (type fixnum start end))
;; I is the index of the next byte to consume.  FAILURE reports it as
;; the error position; a byte is fetched with I already advanced, so
;; the position is one past the byte being complained about.
269 (let ((i start))
270 (flet ((failure (format &rest args)
271 (error 'simple-coding-error
272 :input byteseq :position i :result charseq
273 :format-control format :format-arguments args)))
;; The outer LOOP re-enters the RESTART-CASE after a restart has run.
274 (loop
275 (restart-case
276 (progn
277 (loop
278 (unless (< i end) (return))
279 (let ((byte (aref byteseq (prog1 i (incf i)))))
280 (if (= mlen 0)
;; Idle: BYTE is either a plain ASCII byte or starts a sequence.
281 (if (< byte 128)
282 (vector-push-extend (unicode->char byte) charseq)
;; Scan from bit 7 downwards for the first zero bit.  BYTE is >= 128
;; here, so bit 7 is set; a zero at bit 6 (inner I = 1) means the byte
;; is a continuation byte.  Otherwise inner I - 1 continuation bytes
;; follow.  The inner I shadows the byte index only inside DOTIMES.
283 (setf mlen (block zero
284 (dotimes (i 7)
285 (when (= (ldb (byte 1 (- 7 i)) byte) 0)
286 (when (< i 2)
287 (failure "UTF-8 sequence started with continuation byte: ~D" byte))
288 (return-from zero (1- i))))
;; Bits 7 down to 1 are all set: not a valid start byte.
289 (failure "Invalid UTF-8 sequence start byte: ~D" byte))
;; SETF assigns in order, so MLEN already holds its new value here:
;; the start byte contributes its low 6 - MLEN bits.
290 mbuf (ldb (byte (- 6 mlen) 0) byte)))
;; Inside a sequence: BYTE must have the bit pattern 10xxxxxx.
291 (progn (when (not (= (ldb (byte 2 6) byte) 2))
292 (failure "Invalid UTF-8 continuation byte: ~D" byte))
293 (setf mbuf (+ (* mbuf 64) (ldb (byte 6 0) byte)))
294 (when (= (decf mlen) 0)
;; Sequence complete.  A multibyte form of an ASCII code is reported,
;; but the :ACCEPT restart lets the character through anyway.
295 (when (< mbuf 128)
296 (with-simple-restart (:accept "Accept anyway.")
297 (failure "UTF-8 multibyte sequence denoted an ASCII character ~S (either an encoding error or an attempt at breaking security)." (unicode->char mbuf))))
298 (vector-push-extend (unicode->char mbuf) charseq))))))
;; Input exhausted: report whether a sequence is still pending.
299 (return-from decode (= mlen 0)))
;; Both restarts skip any continuation bytes that follow and reset the
;; decoder to idle; :REPLACE-CHAR additionally emits a replacement
;; character (U+FFFD unless another one is given).
300 (:replace-char (&optional (replacement (unicode->char #xfffd)))
301 :report "Replace the invalid bytes with a character."
302 (vector-push-extend replacement charseq)
303 (loop (unless (and (< i end) (= (ldb (byte 2 6) (aref byteseq i)) 2))
304 (return))
305 (incf i))
306 (setf mlen 0))
307 (:skip-char ()
308 :report "Ignore the invalid byte sequence."
309 (loop (unless (and (< i end) (= (ldb (byte 2 6) (aref byteseq i)) 2))
310 (return))
311 (incf i))
312 (setf mlen 0))))))))
313 #'decode)))
314
;;; UTF8 names the same codec as UTF-8: the MAKE-ENCODER and
;;; MAKE-DECODER properties of UTF-8 are copied onto UTF8.
315(define-codec-synonyms utf-8 utf8)