1 ;;;; CHAR-CODE -- Conversions between characters and byte
2 ;;;; representations thereof
6 (:export "MAKE-ENCODER" "MAKE-DECODER" "ENCODE-STRING" "DECODE-STRING" "SYSTEM-CHARSET"
7 "NO-CODEC-ERROR" "CODING-ERROR"
8 "MAKE-CODEC-CHARACTER-STREAM"
9 "ASCII" "LATIN-1" "LATIN1" "UTF-8" "UTF8"))
10 (in-package :charcode)
;;; Signalled by MAKE-ENCODER and MAKE-DECODER when the requested name has
;;; no constructor registered on its property list.
(define-condition no-codec-error (error)
  ((codec-name :initarg :codec-name
               :documentation "The name that was looked up without success."))
  (:documentation "No encoder/decoder constructor exists under the given name.")
  (:report (lambda (condition stream)
             (format stream "Could find no codec named ~A."
                     (slot-value condition 'codec-name)))))
;;; Base condition for failures that occur while converting between
;;; characters and bytes.
(define-condition coding-error (error)
  ((input
    :initarg :input
    :documentation "The sequence that was being converted.")
   (position
    :initarg :position
    :documentation "Index into the input associated with the failure.")
   (result
    :initarg :result
    :documentation "The output sequence as built up so far."))
  (:documentation "An encoding or decoding operation could not be completed."))
;;; Concrete CODING-ERROR whose report text comes from a format control
;;; string and arguments (inherited from SIMPLE-ERROR).
(define-condition simple-coding-error (coding-error simple-error)
  ()
  (:documentation "A CODING-ERROR described by a format control and arguments."))
(defun coding-error (input position result format &rest format-args)
  "Signal a SIMPLE-CODING-ERROR.

INPUT, POSITION and RESULT are stored in the condition's slots of the same
names; FORMAT and FORMAT-ARGS become its format control and arguments."
  (let ((condition (make-condition 'simple-coding-error
                                   :input input
                                   :position position
                                   :result result
                                   :format-control format
                                   :format-arguments format-args)))
    (error condition)))
32 (deftype decoder-fun () `(function ((array (unsigned-byte 8))
34 &key (start fixnum) (end fixnum))
36 (deftype encoder-fun () `(function ((array character)
37 (array (unsigned-byte 8))
38 &key (start fixnum) (end fixnum))
(defmacro define-encoder ((name) &body make-encoder)
  "Register an encoder constructor for the codec called NAME.

The body forms are wrapped in a zero-argument function that is stored under
the MAKE-ENCODER indicator on NAME's property list."
  `(setf (get (quote ,name) 'make-encoder)
         (lambda ()
           ,@make-encoder)))
(defmacro define-decoder ((name) &body make-decoder)
  "Register a decoder constructor for the codec called NAME.

The body forms are wrapped in a zero-argument function that is stored under
the MAKE-DECODER indicator on NAME's property list."
  `(setf (get (quote ,name) 'make-decoder)
         (lambda ()
           ,@make-decoder)))
47 (defmacro define-codec-synonyms (name &rest synonyms)
48 `(eval-when (:load-toplevel :execute)
49 ,@(mapcar #'(lambda (sym)
50 `(setf (get ',sym 'make-encoder) (get ',name 'make-encoder)
51 (get ',sym 'make-decoder) (get ',name 'make-decoder)))
(defun make-encoder (name)
  "Return a fresh encoder function for the codec registered under NAME.

Looks up the constructor stored under the MAKE-ENCODER indicator on NAME's
property list and calls it.  Signals NO-CODEC-ERROR when none is registered."
  (let ((constructor (get name 'make-encoder)))
    (unless constructor
      (error 'no-codec-error :codec-name name))
    ;; VALUES keeps only the primary value of the constructor call.
    (the encoder-fun (values (funcall constructor)))))
(defun make-decoder (name)
  "Return a fresh decoder function for the codec registered under NAME.

Looks up the constructor stored under the MAKE-DECODER indicator on NAME's
property list and calls it.  Signals NO-CODEC-ERROR when none is registered."
  (let ((constructor (get name 'make-decoder)))
    (unless constructor
      (error 'no-codec-error :codec-name name))
    ;; VALUES keeps only the primary value of the constructor call.
    (the decoder-fun (values (funcall constructor)))))
62 (defun system-charset ()
63 ;; XXX: Replace me with something perhaps more sensible.
66 (defun encode-string (string &optional (coding (system-charset)))
67 (declare (type string string))
68 (let ((encoder (make-encoder coding))
69 (buf (make-array (list (length string)) :element-type '(unsigned-byte 8) :adjustable t :fill-pointer 0)))
70 (unless (funcall encoder string buf)
71 (coding-error string (length string) buf "Encoding of string in ~A ended prematurely." coding))
74 (defun decode-string (buffer &optional (coding (system-charset)))
75 (declare (type (array (unsigned-byte 8)) buffer))
76 (let ((decoder (make-decoder coding))
77 (buf (make-array (list (length buffer)) :element-type 'character :adjustable t :fill-pointer 0)))
78 (unless (funcall decoder buffer buf)
79 (coding-error buffer (length buffer) buf "~A byte sequence ended prematurely." coding))
82 ;;; Gray stream implementation
84 ;; Disabled for now. There doesn't seem to be any good way to get
85 ;; these working generally over various implementations.
88 (defclass codec-character-stream (fundamental-character-input-stream fundamental-character-output-stream)
89 ((decoder :initarg :decoder)
90 (encoder :initarg :encoder)
92 (read-pos :initform 0)
93 (buffer :initform (make-array '(64) :element-type 'character :adjustable t :fill-pointer 0))))
(defun make-codec-character-stream (real-stream &optional (charset (system-charset)))
  "Wrap REAL-STREAM in a CODEC-CHARACTER-STREAM using the codec named CHARSET.

CHARSET defaults to the value of (SYSTEM-CHARSET).  A decoder and an encoder
are created for it, in that order, and handed to the new stream together
with REAL-STREAM as its backing stream."
  (declare (type stream real-stream))
  (let* ((decoder (make-decoder charset))
         (encoder (make-encoder charset)))
    (make-instance 'codec-character-stream
                   :decoder decoder
                   :encoder encoder
                   :back real-stream)))
99 (defmethod close ((stream codec-character-stream) &key abort)
100 (with-slots (back) stream
101 (close back :abort abort))
(defmethod open-stream-p ((stream codec-character-stream))
  "Return true when the backing stream of STREAM is still open."
  (with-slots (back) stream
    ;; Ask the backing stream.  Asking STREAM itself would re-enter this
    ;; very method and never terminate.
    (open-stream-p back)))
108 (defun ccs-ensure-buffer (stream len)
109 (declare (type codec-character-stream stream)
111 (with-slots (decoder back buffer read-pos) stream
112 (let ((readbuf (make-array (list len) :element-type '(unsigned-byte 8))))
113 (loop (unless (< (- (length buffer) read-pos) len) (return t))
114 (let ((readlen (read-sequence readbuf back :end (- len (- (length buffer) read-pos)))))
116 (return-from ccs-ensure-buffer nil))
117 (funcall decoder readbuf buffer :end readlen))))))
119 (defun ccs-clear-buffer (stream)
120 (declare (type codec-character-stream stream))
121 (with-slots (read-pos buffer) stream
122 (replace buffer buffer :start2 read-pos)
123 (setf (fill-pointer buffer) (- (fill-pointer buffer) read-pos)
(defmethod stream-read-char ((stream codec-character-stream))
  "Return the next decoded character from STREAM, or :EOF when
CCS-ENSURE-BUFFER cannot make one available."
  (if (ccs-ensure-buffer stream 1)
      (with-slots (read-pos buffer) stream
        (let ((ch (aref buffer read-pos)))
          (incf read-pos)
          ;; Once the read position reaches 16, hand the buffer to
          ;; CCS-CLEAR-BUFFER.
          (when (>= read-pos 16)
            (ccs-clear-buffer stream))
          ch))
      :eof))
134 (defmethod stream-unread-char ((stream codec-character-stream) char)
135 (with-slots (read-pos buffer) stream
137 (let ((len (length buffer)))
138 (when (< (array-dimension buffer 0) (+ len 16))
139 (adjust-array buffer (list (setf (fill-pointer buffer)
141 (replace buffer buffer :start1 16 :end2 len)))
142 (setf (aref buffer (decf read-pos)) char)
145 (defun ccs-wont-hang-p (stream)
146 (declare (type codec-character-stream stream))
147 (with-slots (read-pos back buffer) stream
148 (or (and (< read-pos (length buffer)) (aref buffer read-pos))
151 (defmethod stream-read-char-no-hang ((stream codec-character-stream))
152 (if (ccs-wont-hang-p stream)
153 (stream-read-char stream)
(defmethod stream-peek-char ((stream codec-character-stream))
  "Return the next decoded character without advancing the read position,
or :EOF when CCS-ENSURE-BUFFER cannot make one available."
  (if (ccs-ensure-buffer stream 1)
      (aref (slot-value stream 'buffer)
            (slot-value stream 'read-pos))
      :eof))
162 (defmethod stream-listen ((stream codec-character-stream))
163 (if (ccs-wont-hang-p stream)
164 (let ((peek (stream-peek-char stream)))
(defmethod stream-write-char ((stream codec-character-stream) char)
  "Encode CHAR with the stream's encoder and write the resulting bytes to
the backing stream.  Returns CHAR, as the Gray stream protocol specifies for
STREAM-WRITE-CHAR (the previous version returned the byte buffer)."
  (with-slots (encoder back) stream
    (let ((seq (make-array '(1) :element-type 'character :initial-element char))
          ;; 16 is only the initial capacity: the buffer is adjustable and
          ;; the encoders in this file append with VECTOR-PUSH-EXTEND.
          (outbuf (make-array '(16) :element-type '(unsigned-byte 8)
                                    :adjustable t :fill-pointer 0)))
      (funcall encoder seq outbuf)
      (write-sequence outbuf back)
      char)))
(defmethod stream-finish-output ((stream codec-character-stream))
  "Forward FINISH-OUTPUT to the backing stream."
  (with-slots (back) stream
    (finish-output back)))
(defmethod stream-force-output ((stream codec-character-stream))
  "Forward FORCE-OUTPUT to the backing stream."
  (with-slots (back) stream
    (force-output back)))
(defmethod stream-read-sequence ((stream codec-character-stream) seq &optional (start 0) (end (length seq)))
  "Copy decoded characters from STREAM into SEQ between START and END.

Returns the index of the first element of SEQ that was not updated, which
is the READ-SEQUENCE contract; the previous version returned the value of
the trailing WHEN form.  An END of NIL is treated as the length of SEQ."
  (let ((end (or end (length seq))))
    (ccs-ensure-buffer stream (- end start))
    (with-slots (read-pos buffer) stream
      ;; REPLACE copies the shorter of the two regions, i.e. LEN elements.
      (let ((len (min (- end start) (- (length buffer) read-pos))))
        (replace seq buffer :start1 start :end1 end :start2 read-pos :end2 (length buffer))
        (when (>= (incf read-pos len) 128)
          (ccs-clear-buffer stream))
        (+ start len)))))
(defmethod stream-write-sequence ((stream codec-character-stream) seq &optional (start 0) (end (length seq)))
  "Encode the characters of SEQ between START and END and write the bytes
to the backing stream.  Returns SEQ, as WRITE-SEQUENCE does.

START and END are now forwarded to the encoder; previously they only sized
the output buffer and the whole of SEQ was encoded regardless.  An END of
NIL is treated as the length of SEQ."
  (let ((end (or end (length seq))))
    (with-slots (encoder back) stream
      ;; The size is only an initial capacity; the buffer is adjustable.
      (let ((outbuf (make-array (list (- end start))
                                :element-type '(unsigned-byte 8)
                                :adjustable t :fill-pointer 0)))
        (funcall encoder seq outbuf :start start :end end)
        (write-sequence outbuf back)
        seq))))
198 ;;; Implementation-specific functions
200 #+(or (and clisp unicode) sbcl)
201 (defun unicode->char (unicode)
202 (declare (type (unsigned-byte 24) unicode))
205 #+(or (and clisp unicode) sbcl)
206 (defun char->unicode (char)
207 (declare (type character char))
212 (defun decode-ascii (byteseq charseq &key (start 0) (end (length byteseq)))
213 (declare (type (array (unsigned-byte 8)) byteseq)
214 (type (array character) charseq)
215 (type fixnum start end))
219 (unless (< start end) (return-from decode-ascii t))
220 (let ((byte (aref byteseq (prog1 start (incf start)))))
222 (coding-error byteseq start charseq "Invalid byte ~D in ASCII stream." byte))
223 (vector-push-extend (unicode->char byte) charseq)))
224 (:replace-char (&optional (replacement (unicode->char #xfffd)))
225 :report "Replace the invalid byte with a character."
226 (vector-push-extend replacement charseq))
228 :report "Ignore the invalid byte."
231 (defun encode-ascii (charseq byteseq &key (start 0) (end (length charseq)))
232 (declare (type (array (unsigned-byte 8)) byteseq)
233 (type (array character) charseq)
234 (type fixnum start end))
238 (unless (< start end) (return-from encode-ascii t))
239 (vector-push-extend (let ((cp (char->unicode (aref charseq (prog1 start (incf start))))))
241 (coding-error charseq start byteseq "ASCII cannot encode code-points higher than 128."))
244 (:replace-char (&optional (replacement #\?))
245 :report "Replace this character with another."
246 (vector-push-extend (char->unicode replacement) byteseq))
248 :report "Ignore this character."
251 (define-decoder (ascii)
254 (define-encoder (ascii)
;; Make the keyword :ASCII an alias for the ASCII codec: DEFINE-CODEC-SYNONYMS
;; copies ASCII's MAKE-ENCODER and MAKE-DECODER properties onto the alias, so
;; MAKE-ENCODER / MAKE-DECODER find the same constructors under either name.
257 (define-codec-synonyms ascii :ascii)
261 (defun decode-latin-1 (byteseq charseq &key (start 0) (end (length byteseq)))
262 (declare (type (array (unsigned-byte 8)) byteseq)
263 (type (array character) charseq)
264 (type fixnum start end))
265 (do ((i start (1+ i)))
267 (vector-push-extend (unicode->char (aref byteseq i)) charseq))
270 (defun encode-latin-1 (charseq byteseq &key (start 0) (end (length charseq)))
271 (declare (type (array (unsigned-byte 8)) byteseq)
272 (type (array character) charseq)
273 (type fixnum start end))
277 (unless (< start end) (return-from encode-latin-1 t))
278 (vector-push-extend (let ((cp (char->unicode (aref charseq (prog1 start (incf start))))))
280 (coding-error charseq start byteseq "ISO-8859-1 cannot encode code-points higher than 256."))
283 (:replace-char (&optional (replacement #\?))
284 :report "Replace this character with another."
285 (vector-push-extend (char->unicode replacement) byteseq))
287 :report "Ignore this character."
290 (define-decoder (latin-1)
293 (define-encoder (latin-1)
;; Alias the LATIN-1 codec under LATIN1 and ISO-8859-1, plus the keyword forms
;; of all three names: DEFINE-CODEC-SYNONYMS copies LATIN-1's MAKE-ENCODER and
;; MAKE-DECODER properties onto each listed symbol.
296 (define-codec-synonyms latin-1 latin1 iso-8859-1 :latin-1 :latin1 :iso-8859-1)
300 (defun encode-utf-8 (charseq byteseq &key (start 0) (end (length charseq)))
301 (declare (type (array (unsigned-byte 8)) byteseq)
302 (type (array character) charseq)
303 (type fixnum start end))
304 (do ((i start (1+ i)))
306 (let ((cp (char->unicode (aref charseq i))))
308 (vector-push-extend cp byteseq)
312 (push (logior (ldb (byte 6 0) cp) #x80) bytes)
313 (setf cp (truncate cp 64))
315 (when (< cp (expt 2 (- 6 nbytes)))
316 (push (logior (logand #xff (lognot (1- (expt 2 (- 7 nbytes)))))
321 (vector-push-extend byte byteseq))))))
324 (define-encoder (utf-8)
327 (define-decoder (utf-8)
330 (flet ((decode (byteseq charseq &key (start 0) (end (length byteseq)))
331 (declare (type (array (unsigned-byte 8)) byteseq)
332 (type (array character) charseq)
333 (type fixnum start end))
335 (flet ((failure (format &rest args)
336 (error 'simple-coding-error
337 :input byteseq :position i :result charseq
338 :format-control format :format-arguments args)))
343 (unless (< i end) (return))
344 (let ((byte (aref byteseq (prog1 i (incf i)))))
347 (vector-push-extend (unicode->char byte) charseq)
348 (setf mlen (block zero
350 (when (= (ldb (byte 1 (- 7 i)) byte) 0)
352 (failure "UTF-8 sequence started with continuation byte: ~D" byte))
353 (return-from zero (1- i))))
354 (failure "Invalid UTF-8 sequence start byte: ~D" byte))
355 mbuf (ldb (byte (- 6 mlen) 0) byte)))
356 (progn (when (not (= (ldb (byte 2 6) byte) 2))
357 (failure "Invalid UTF-8 continuation byte: ~D" byte))
358 (setf mbuf (+ (* mbuf 64) (ldb (byte 6 0) byte)))
359 (when (= (decf mlen) 0)
361 (with-simple-restart (:accept "Accept anyway.")
362 (failure "UTF-8 multibyte sequence denoted an ASCII character ~S (either an encoding error or an attempt at breaking security)." (unicode->char mbuf))))
363 (vector-push-extend (unicode->char mbuf) charseq))))))
364 (return-from decode (= mlen 0)))
365 (:replace-char (&optional (replacement (unicode->char #xfffd)))
366 :report "Replace the invalid bytes with a character."
367 (vector-push-extend replacement charseq)
368 (loop (unless (and (< i end) (= (ldb (byte 2 6) (aref byteseq i)) 2))
373 :report "Ignore the invalid byte sequence."
374 (loop (unless (and (< i end) (= (ldb (byte 2 6) (aref byteseq i)) 2))
;; Alias the UTF-8 codec under UTF8 and the keywords :UTF-8 and :UTF8:
;; DEFINE-CODEC-SYNONYMS copies UTF-8's MAKE-ENCODER and MAKE-DECODER
;; properties onto each listed symbol.
380 (define-codec-synonyms utf-8 utf8 :utf-8 :utf8)