Commit | Line | Data |
---|---|---|
fd26d811 FT |
1 | ;;;; CHAR-CODE -- Conversions between characters and byte |
2 | ;;;; representations thereof | |
3 | ||
;;; Package definition.  Everything below lives in CHARCODE; the export
;;; list covers the codec constructors, the string convenience functions,
;;; the condition types and the symbols that name the built-in codecs.
(defpackage #:charcode
  (:use #:common-lisp)
  (:export
   ;; Codec construction and whole-string helpers
   "MAKE-ENCODER" "MAKE-DECODER"
   "ENCODE-STRING" "DECODE-STRING"
   "SYSTEM-CHARSET"
   ;; Conditions
   "NO-CODEC-ERROR" "CODING-ERROR"
   ;; Stream wrapper
   "MAKE-CODEC-CHARACTER-STREAM"
   ;; Codec names
   "ASCII" "LATIN-1" "LATIN1" "UTF-8" "UTF8"))

(in-package #:charcode)
11 | ||
12 | ;;; General stuff | |
13 | ||
c94c6f05 FT |
(define-condition no-codec-error (error)
  ((codec-name :initarg :codec-name))
  (:documentation
   "Signalled when an encoder or decoder is requested under a name that has
no codec registered.  CODEC-NAME holds the name that was looked up.")
  (:report (lambda (condition stream)
             (format stream "Could find no codec named ~A."
                     (slot-value condition 'codec-name)))))
19 | ||
fd26d811 FT |
(define-condition coding-error (error)
  ((input    :initarg :input)
   (position :initarg :position)
   (result   :initarg :result))
  (:documentation
   "Base class for errors raised while encoding or decoding.  INPUT is the
sequence being converted, POSITION an index into it and RESULT the output
sequence as filled in so far."))

(define-condition simple-coding-error (coding-error simple-error)
  ()
  (:documentation
   "A CODING-ERROR whose report is produced from a format control and
arguments, as for SIMPLE-ERROR."))
26 | ||
(defun coding-error (input position result format &rest format-args)
  "Signal a SIMPLE-CODING-ERROR describing a failure at POSITION of INPUT.
RESULT is the partially filled output sequence; FORMAT and FORMAT-ARGS
build the report message."
  (error (make-condition 'simple-coding-error
                         :input input
                         :position position
                         :result result
                         :format-control format
                         :format-arguments format-args)))
31 | ||
;; Type of a decoder: called with a byte array and a character array (plus
;; optional START/END bounds) and returning a boolean.
(deftype decoder-fun ()
  '(function ((array (unsigned-byte 8))
              (array character)
              &key (start fixnum) (end fixnum))
    boolean))
;; Type of an encoder: called with a character array and a byte array (plus
;; optional START/END bounds) and returning a boolean.
(deftype encoder-fun ()
  '(function ((array character)
              (array (unsigned-byte 8))
              &key (start fixnum) (end fixnum))
    boolean))
40 | ||
(defmacro define-encoder ((name) &body make-encoder)
  "Register an encoder constructor for the codec called NAME.
The MAKE-ENCODER forms become the body of a zero-argument function that is
stored on NAME's property list under the MAKE-ENCODER indicator."
  (let ((constructor `(lambda () ,@make-encoder)))
    `(setf (get ',name 'make-encoder) ,constructor)))
43 | ||
(defmacro define-decoder ((name) &body make-decoder)
  "Register a decoder constructor for the codec called NAME.
The MAKE-DECODER forms become the body of a zero-argument function that is
stored on NAME's property list under the MAKE-DECODER indicator."
  (let ((constructor `(lambda () ,@make-decoder)))
    `(setf (get ',name 'make-decoder) ,constructor)))
46 | ||
(defmacro define-codec-synonyms (name &rest synonyms)
  "Make each symbol in SYNONYMS an alias for the codec NAME.
At load or execution time the MAKE-ENCODER and MAKE-DECODER properties of
NAME are copied onto every synonym."
  `(eval-when (:load-toplevel :execute)
     ,@(loop for alias in synonyms
             collect `(setf (get ',alias 'make-encoder) (get ',name 'make-encoder)
                            (get ',alias 'make-decoder) (get ',name 'make-decoder)))))
53 | ||
(defun make-encoder (name)
  "Return a fresh encoder function for the codec registered under NAME.
Signals NO-CODEC-ERROR when NAME has no MAKE-ENCODER property."
  (let ((constructor (get name 'make-encoder)))
    (unless constructor
      (error 'no-codec-error :codec-name name))
    ;; VALUES trims the constructor's result to its primary value.
    (the encoder-fun (values (funcall constructor)))))
fd26d811 FT |
57 | |
(defun make-decoder (name)
  "Return a fresh decoder function for the codec registered under NAME.
Signals NO-CODEC-ERROR when NAME has no MAKE-DECODER property."
  (let ((constructor (get name 'make-decoder)))
    (unless constructor
      (error 'no-codec-error :codec-name name))
    ;; VALUES trims the constructor's result to its primary value.
    (the decoder-fun (values (funcall constructor)))))
fd26d811 | 61 | |
6f9e13dc FT |
(defun system-charset ()
  "Return the name of the codec used when callers do not specify one.
Currently this is always the symbol UTF-8."
  ;; XXX: Hard-wired for now; replace with something perhaps more sensible.
  (quote utf-8))
65 | ||
(defun encode-string (string &optional (coding (system-charset)))
  "Encode STRING with the codec named CODING and return the bytes.
The result is a fresh adjustable (UNSIGNED-BYTE 8) vector with a fill
pointer.  Signals NO-CODEC-ERROR if CODING names no encoder, and a
CODING-ERROR if the encoder reports that encoding ended prematurely."
  (declare (type string string))
  (let* ((encoder (make-encoder coding))
         ;; The encoders in this file declare their input as (ARRAY
         ;; CHARACTER).  Strings with a narrower element type (e.g.
         ;; base-strings) are copied into a CHARACTER vector first so they
         ;; are accepted as well.
         (chars (if (typep string '(array character))
                    string
                    (coerce string '(simple-array character (*)))))
         (buf (make-array (list (length string))
                          :element-type '(unsigned-byte 8)
                          :adjustable t
                          :fill-pointer 0)))
    (unless (funcall encoder chars buf)
      (coding-error string (length string) buf
                    "Encoding of string in ~A ended prematurely." coding))
    buf))
73 | ||
(defun decode-string (buffer &optional (coding (system-charset)))
  "Decode the byte array BUFFER with the codec named CODING.
Returns a fresh adjustable character vector with a fill pointer.  Signals
a CODING-ERROR if the decoder reports that the input ended prematurely."
  (declare (type (array (unsigned-byte 8)) buffer))
  (let* ((decoder (make-decoder coding))
         (octet-count (length buffer))
         (chars (make-array octet-count
                            :element-type 'character
                            :adjustable t
                            :fill-pointer 0))
         (complete-p (funcall decoder buffer chars)))
    (when (not complete-p)
      (coding-error buffer octet-count chars
                    "~A byte sequence ended prematurely." coding))
    chars))
81 | ||
82 | ;;; Gray stream implementation | |
83 | ||
53d1dafe FT |
84 | ;; Disabled for now. There doesn't seem to be any good way to get |
85 | ;; these working generally over various implementations. | |
86 | ||
87 | #+unused ( | |
fd26d811 FT |
(defclass codec-character-stream (fundamental-character-input-stream
                                  fundamental-character-output-stream)
  ((decoder  :initarg :decoder)
   (encoder  :initarg :encoder)
   (back     :initarg :back)
   (read-pos :initform 0)
   (buffer   :initform (make-array 64
                                   :element-type 'character
                                   :adjustable t
                                   :fill-pointer 0)))
  (:documentation
   "Bidirectional character stream layered over the stream in BACK.
DECODER and ENCODER convert between characters and bytes.  BUFFER holds
decoded characters and READ-POS indexes the next one to hand out."))
94 | ||
(defun make-codec-character-stream (real-stream &optional (charset (system-charset)))
  "Wrap REAL-STREAM in a CODEC-CHARACTER-STREAM that uses CHARSET.
A decoder and an encoder for CHARSET are created for the new stream."
  (declare (type stream real-stream))
  (let* ((decoder (make-decoder charset))
         (encoder (make-encoder charset)))
    (make-instance 'codec-character-stream
                   :decoder decoder
                   :encoder encoder
                   :back real-stream)))
98 | ||
fd26d811 FT |
;; Closing the codec stream closes the backing stream first (forwarding
;; ABORT) and then defers to the next applicable CLOSE method.
(defmethod close ((stream codec-character-stream) &key abort)
  (close (slot-value stream 'back) :abort abort)
  (call-next-method))
103 | ||
;; The codec stream is open exactly when its backing stream is.  The query
;; must go to BACK: asking STREAM itself would re-enter this method and
;; never return.
(defmethod open-stream-p ((stream codec-character-stream))
  (with-slots (back) stream
    (open-stream-p back)))
107 | ||
(defun ccs-ensure-buffer (stream len)
  "Try to make at least LEN unread characters available in STREAM's buffer.
Bytes are read from the backing stream and decoded into the buffer until
enough characters are present.  Returns T on success, or NIL as soon as
the backing stream yields no more bytes."
  (declare (type codec-character-stream stream)
           (type integer len))
  (with-slots (decoder back buffer read-pos) stream
    (let ((octets (make-array len :element-type '(unsigned-byte 8))))
      (loop
        (let ((missing (- len (- (length buffer) read-pos))))
          (unless (plusp missing)
            (return t))
          ;; Never request more bytes than characters are still missing.
          (let ((nread (read-sequence octets back :end missing)))
            (when (zerop nread)
              (return nil))
            (funcall decoder octets buffer :end nread)))))))
118 | ||
(defun ccs-clear-buffer (stream)
  "Discard the already-consumed prefix of STREAM's character buffer.
The unread characters are moved to the front of the buffer, the fill
pointer is shrunk to match and READ-POS is reset to 0."
  (declare (type codec-character-stream stream))
  (with-slots (read-pos buffer) stream
    (let ((remaining (- (fill-pointer buffer) read-pos)))
      ;; Copy front-to-back by hand: source and destination overlap within
      ;; the same array, which REPLACE does not portably support.  Moving
      ;; downwards in ascending order never overwrites unread data.
      (dotimes (k remaining)
        (setf (aref buffer k) (aref buffer (+ k read-pos))))
      (setf (fill-pointer buffer) remaining
            read-pos 0))))
fd26d811 FT |
125 | |
;; Return the next decoded character, or :EOF when none can be obtained.
;; Once READ-POS reaches 16 the consumed prefix of the buffer is dropped.
(defmethod stream-read-char ((stream codec-character-stream))
  (if (ccs-ensure-buffer stream 1)
      (with-slots (read-pos buffer) stream
        (let ((next (aref buffer read-pos)))
          (incf read-pos)
          (when (>= read-pos 16)
            (ccs-clear-buffer stream))
          next))
      :eof))
133 | ||
;; Push CHAR back so that it is the next character read.  Always returns
;; NIL.
;;
;; When nothing has been consumed from the buffer (READ-POS is 0) there is
;; no slot in front of the unread data, so 16 characters of headroom are
;; opened up first: the array is grown if needed, the fill pointer is
;; extended, the existing contents are shifted up and READ-POS is moved
;; past the headroom.
(defmethod stream-unread-char ((stream codec-character-stream) char)
  (with-slots (read-pos buffer) stream
    (when (= read-pos 0)
      (let* ((headroom 16)
             (len (length buffer))
             (new-len (+ len headroom)))
        ;; Grow before touching the fill pointer; a fill pointer may not
        ;; exceed the array's dimension.
        (when (< (array-dimension buffer 0) new-len)
          (setf buffer (adjust-array buffer new-len)))
        (setf (fill-pointer buffer) new-len)
        ;; Shift upwards from the last element down so that no element is
        ;; overwritten before it has been moved.
        (loop for k from (1- len) downto 0
              do (setf (aref buffer (+ k headroom)) (aref buffer k)))
        (setf read-pos headroom)))
    (setf (aref buffer (decf read-pos)) char)
    nil))
144 | ||
(defun ccs-wont-hang-p (stream)
  "Return true if a read on STREAM need not wait for input.
That is the case when an unread character is already buffered (it is then
returned) or when LISTEN on the backing stream returns true."
  (declare (type codec-character-stream stream))
  (with-slots (read-pos back buffer) stream
    (cond ((and (< read-pos (length buffer))
                (aref buffer read-pos)))
          (t (listen back)))))
150 | ||
;; Read a character only if CCS-WONT-HANG-P says that will not block;
;; otherwise return NIL.
(defmethod stream-read-char-no-hang ((stream codec-character-stream))
  (and (ccs-wont-hang-p stream)
       (stream-read-char stream)))
155 | ||
;; Return the next character without consuming it, or :EOF when no
;; character can be buffered.
(defmethod stream-peek-char ((stream codec-character-stream))
  (if (ccs-ensure-buffer stream 1)
      (with-slots (read-pos buffer) stream
        (aref buffer read-pos))
      :eof))
161 | ||
;; Return the next character if one can be peeked without blocking and
;; the stream is not at end of file; otherwise return NIL.
(defmethod stream-listen ((stream codec-character-stream))
  (when (ccs-wont-hang-p stream)
    (let ((next (stream-peek-char stream)))
      (unless (eq next :eof)
        next))))
169 | ||
;; Encode CHAR on its own and write the resulting bytes to the backing
;; stream.  The value is whatever WRITE-SEQUENCE returns.
(defmethod stream-write-char ((stream codec-character-stream) char)
  (with-slots (encoder back) stream
    (let* ((text (make-string 1 :initial-element char))
           (octets (make-array 16
                               :element-type '(unsigned-byte 8)
                               :adjustable t
                               :fill-pointer 0)))
      (funcall encoder text octets)
      (write-sequence octets back))))
176 | ||
;; Output is not buffered at this layer, so finishing output is delegated
;; to the backing stream.
(defmethod stream-finish-output ((stream codec-character-stream))
  (with-slots (back) stream
    (finish-output back)))
179 | ||
;; Output is not buffered at this layer, so forcing output is delegated
;; to the backing stream.
(defmethod stream-force-output ((stream codec-character-stream))
  (with-slots (back) stream
    (force-output back)))
182 | ||
;; Fill SEQ between START and END with decoded characters.  Returns the
;; index of the first element of SEQ that was not updated, mirroring
;; READ-SEQUENCE; this is less than END when the backing stream ran out
;; of data.  END may be NIL, meaning the end of SEQ.
(defmethod stream-read-sequence ((stream codec-character-stream) seq &optional (start 0) (end (length seq)))
  (let ((end (or end (length seq))))
    ;; The result is deliberately ignored: on a short read whatever is
    ;; buffered is still delivered.
    (ccs-ensure-buffer stream (- end start))
    (with-slots (read-pos buffer) stream
      (let ((len (min (- end start) (- (length buffer) read-pos))))
        (replace seq buffer :start1 start :end1 end :start2 read-pos :end2 (length buffer))
        ;; Drop the consumed prefix once it has grown to 128 characters.
        (when (>= (incf read-pos len) 128)
          (ccs-clear-buffer stream))
        (+ start len)))))
190 | ||
;; Encode the characters of SEQ between START and END and write the bytes
;; to the backing stream.  END may be NIL, meaning the end of SEQ.  The
;; bounds are forwarded to the encoder so that only the requested
;; subsequence is written.  The value is whatever WRITE-SEQUENCE returns.
(defmethod stream-write-sequence ((stream codec-character-stream) seq &optional (start 0) (end (length seq)))
  (with-slots (encoder back) stream
    (let* ((end (or end (length seq)))
           (outbuf (make-array (list (- end start))
                               :element-type '(unsigned-byte 8)
                               :adjustable t
                               :fill-pointer 0)))
      (funcall encoder seq outbuf :start start :end end)
      (write-sequence outbuf back))))
53d1dafe | 196 | ) |
fd26d811 FT |
197 | |
198 | ;;; Implementation-specific functions | |
199 | ||
#+(or (and clisp unicode) sbcl)
(defun unicode->char (unicode)
  "Return the character whose code is UNICODE, using CODE-CHAR directly."
  (declare (type (unsigned-byte 24) unicode))
  (values (code-char unicode)))
204 | ||
#+(or (and clisp unicode) sbcl)
(defun char->unicode (char)
  "Return the code of CHAR, using CHAR-CODE directly."
  (declare (type character char))
  (values (char-code char)))
209 | ||
dfa6197c FT |
#+ecl
(defun unicode->char (unicode)
  "Return the character whose code is UNICODE.
Codes of 256 and above are rejected with an error."
  (declare (type (unsigned-byte 24) unicode))
  (unless (< unicode 256)
    (error "ECL does not handle Unicode characters outside Latin-1."))
  (values (code-char unicode)))
216 | ||
#+ecl
(defun char->unicode (char)
  "Return the code of CHAR, using CHAR-CODE directly."
  (declare (type character char))
  (values (char-code char)))
221 | ||
4dd02a73 FT |
222 | ;;; ASCII |
223 | ||
(defun decode-ascii (byteseq charseq &key (start 0) (end (length byteseq)))
  "Decode the bytes of BYTESEQ from START below END as ASCII onto CHARSEQ.
Returns T once all bytes are processed.  A byte of 128 or above signals a
CODING-ERROR; the :REPLACE-CHAR restart appends a replacement character
and the :SKIP-CHAR restart drops the byte, after which decoding resumes
with the following byte."
  (declare (type (array (unsigned-byte 8)) byteseq)
           (type (array character) charseq)
           (type fixnum start end))
  ;; The outer LOOP re-establishes the restarts after one has been used.
  (loop
    (restart-case
        (do ()
            ((>= start end) (return-from decode-ascii t))
          (let ((octet (aref byteseq start)))
            ;; Advance first so a restart continues after the bad byte.
            (incf start)
            (when (>= octet 128)
              (coding-error byteseq start charseq
                            "Invalid byte ~D in ASCII stream." octet))
            (vector-push-extend (unicode->char octet) charseq)))
      (:replace-char (&optional (replacement (unicode->char #xfffd)))
        :report "Replace the invalid byte with a character."
        (vector-push-extend replacement charseq))
      (:skip-char ()
        :report "Ignore the invalid byte."
        nil))))
242 | ||
(defun encode-ascii (charseq byteseq &key (start 0) (end (length charseq)))
  "Encode the characters of CHARSEQ from START below END as ASCII onto BYTESEQ.
Returns T once all characters are processed.  A character whose code is
128 or above signals a CODING-ERROR; the :REPLACE-CHAR restart emits a
replacement (default #\\?) and the :SKIP-CHAR restart drops the character,
after which encoding resumes with the following character."
  (declare (type (array (unsigned-byte 8)) byteseq)
           (type (array character) charseq)
           (type fixnum start end))
  ;; The outer LOOP re-establishes the restarts after one has been used.
  (loop
    (restart-case
        (loop
          (unless (< start end) (return-from encode-ascii t))
          ;; START is advanced before the range check so that a restart
          ;; continues after the offending character.
          (let ((cp (char->unicode (aref charseq (prog1 start (incf start))))))
            (unless (< cp 128)
              ;; 127 is the largest encodable code point; the message
              ;; states that bound rather than 128.
              (coding-error charseq start byteseq
                            "ASCII cannot encode code-points higher than 127."))
            (vector-push-extend cp byteseq)))
      (:replace-char (&optional (replacement #\?))
        :report "Replace this character with another."
        (vector-push-extend (char->unicode replacement) byteseq))
      (:skip-char ()
        :report "Ignore this character."
        nil))))
262 | ||
;; Register the ASCII codec.  Both constructors simply hand out the
;; stateless global functions.
(define-encoder (ascii)
  (function encode-ascii))

(define-decoder (ascii)
  (function decode-ascii))

;; The keyword :ASCII names the same codec.
(define-codec-synonyms ascii
  :ascii)
270 | ||
fd26d811 FT |
271 | ;;; Latin-1 |
272 | ||
(defun decode-latin-1 (byteseq charseq &key (start 0) (end (length byteseq)))
  "Decode the bytes of BYTESEQ from START below END as Latin-1 onto CHARSEQ.
Each byte maps to the character with the same code.  Always returns T."
  (declare (type (array (unsigned-byte 8)) byteseq)
           (type (array character) charseq)
           (type fixnum start end))
  (loop for index from start below end
        do (vector-push-extend (unicode->char (aref byteseq index)) charseq))
  t)
281 | ||
(defun encode-latin-1 (charseq byteseq &key (start 0) (end (length charseq)))
  "Encode the characters of CHARSEQ from START below END as Latin-1 onto BYTESEQ.
Returns T once all characters are processed.  A character whose code is
256 or above signals a CODING-ERROR; the :REPLACE-CHAR restart emits a
replacement (default #\\?) and the :SKIP-CHAR restart drops the character,
after which encoding resumes with the following character."
  (declare (type (array (unsigned-byte 8)) byteseq)
           (type (array character) charseq)
           (type fixnum start end))
  ;; The outer LOOP re-establishes the restarts after one has been used.
  (loop
    (restart-case
        (loop
          (unless (< start end) (return-from encode-latin-1 t))
          ;; START is advanced before the range check so that a restart
          ;; continues after the offending character.
          (let ((cp (char->unicode (aref charseq (prog1 start (incf start))))))
            (unless (< cp 256)
              ;; 255 is the largest encodable code point; the message
              ;; states that bound rather than 256.
              (coding-error charseq start byteseq
                            "ISO-8859-1 cannot encode code-points higher than 255."))
            (vector-push-extend cp byteseq)))
      (:replace-char (&optional (replacement #\?))
        :report "Replace this character with another."
        (vector-push-extend (char->unicode replacement) byteseq))
      (:skip-char ()
        :report "Ignore this character."
        nil))))
301 | ||
;; Register the Latin-1 codec.  Both constructors simply hand out the
;; stateless global functions.
(define-encoder (latin-1)
  (function encode-latin-1))

(define-decoder (latin-1)
  (function decode-latin-1))

;; Alternative spellings, both as plain symbols and as keywords.
(define-codec-synonyms latin-1
  latin1 iso-8859-1
  :latin-1 :latin1 :iso-8859-1)
fd26d811 FT |
309 | |
310 | ;;; UTF-8 | |
311 | ||
(defun encode-utf-8 (charseq byteseq &key (start 0) (end (length charseq)))
  "Encode the characters of CHARSEQ from START below END as UTF-8 onto BYTESEQ.
Always returns T."
  (declare (type (array (unsigned-byte 8)) byteseq)
           (type (array character) charseq)
           (type fixnum start end))
  (loop for index from start below end
        do (let ((cp (char->unicode (aref charseq index))))
             (cond ((< cp 128)
                    ;; Codes below 128 are written as a single byte.
                    (vector-push-extend cp byteseq))
                   (t
                    ;; Build the sequence back to front: peel off 6-bit
                    ;; continuation payloads until the rest fits into the
                    ;; lead byte for the current sequence length.
                    (let ((octets '())
                          (tail-count 0))
                      (loop
                        (push (logior #x80 (ldb (byte 6 0) cp)) octets)
                        (setf cp (ash cp -6))
                        (incf tail-count)
                        (when (< cp (expt 2 (- 6 tail-count)))
                          ;; Lead byte: high marker bits followed by the
                          ;; remaining payload bits.
                          (let ((marker (logand #xff
                                                (lognot (1- (expt 2 (- 7 tail-count)))))))
                            (push (logior marker cp) octets))
                          (return)))
                      (dolist (octet octets)
                        (vector-push-extend octet byteseq)))))))
  t)
335 | ||
;; The UTF-8 encoder keeps no state, so the constructor hands out the
;; global function itself.
(define-encoder (utf-8)
  (function encode-utf-8))
338 | ||
;; UTF-8 decoder constructor.
;;
;; Each constructed decoder closes over its own MBUF (payload bits gathered
;; so far for the multibyte sequence in progress) and MLEN (continuation
;; bytes still expected).  That state survives between calls, so a sequence
;; may be split across two calls.
;;
;; The decoder takes BYTESEQ, CHARSEQ and the keys START and END, appends
;; the decoded characters to CHARSEQ, and returns true iff the input ended
;; on a sequence boundary (no continuation bytes outstanding).
;;
;; Malformed input signals a SIMPLE-CODING-ERROR with two restarts:
;; :REPLACE-CHAR appends a replacement character (default U+FFFD) and
;; :SKIP-CHAR appends nothing; both then skip any following continuation
;; bytes and resume.  An overlong encoding of an ASCII character
;; additionally offers :ACCEPT, which keeps the decoded character.
(define-decoder (utf-8)
  (let ((mbuf 0)
        (mlen 0))
    (flet ((decode (byteseq charseq &key (start 0) (end (length byteseq)))
             (declare (type (array (unsigned-byte 8)) byteseq)
                      (type (array character) charseq)
                      (type fixnum start end))
             (let ((i start)
                   ;; True while the byte just consumed must be decoded
                   ;; again after a restart (see the continuation check).
                   (reprocess nil))
               (flet ((failure (format &rest args)
                        (error 'simple-coding-error
                               :input byteseq :position i :result charseq
                               :format-control format :format-arguments args))
                      (resynchronize ()
                        ;; Give back a byte that merely interrupted a
                        ;; sequence, skip continuation bytes belonging to
                        ;; the broken sequence, and forget partial state.
                        (when reprocess
                          (decf i)
                          (setf reprocess nil))
                        (loop (unless (and (< i end)
                                           (= (ldb (byte 2 6) (aref byteseq i)) 2))
                                (return))
                              (incf i))
                        (setf mlen 0)))
                 ;; The outer LOOP re-establishes the restarts after one
                 ;; has been used.
                 (loop
                   (restart-case
                       (progn
                         (loop
                           (unless (< i end) (return))
                           (let ((byte (aref byteseq (prog1 i (incf i)))))
                             (if (= mlen 0)
                                 (if (< byte 128)
                                     (vector-push-extend (unicode->char byte) charseq)
                                     ;; Lead byte: the position N of the
                                     ;; first zero bit below bit 7 gives
                                     ;; N-1 continuation bytes.  The low
                                     ;; 6-MLEN bits are payload.
                                     (setf mlen (block zero
                                                  (dotimes (n 7)
                                                    (when (= (ldb (byte 1 (- 7 n)) byte) 0)
                                                      (when (< n 2)
                                                        (failure "UTF-8 sequence started with continuation byte: ~D" byte))
                                                      (return-from zero (1- n))))
                                                  (failure "Invalid UTF-8 sequence start byte: ~D" byte))
                                           mbuf (ldb (byte (- 6 mlen) 0) byte)))
                                 (progn
                                   (when (not (= (ldb (byte 2 6) byte) 2))
                                     ;; This byte is not part of the broken
                                     ;; sequence; it may be a valid
                                     ;; character or lead byte of its own,
                                     ;; so a restart must not swallow it.
                                     (setf reprocess t)
                                     (failure "Invalid UTF-8 continuation byte: ~D" byte))
                                   (setf mbuf (+ (* mbuf 64) (ldb (byte 6 0) byte)))
                                   (when (= (decf mlen) 0)
                                     (when (< mbuf 128)
                                       (with-simple-restart (:accept "Accept anyway.")
                                         (failure "UTF-8 multibyte sequence denoted an ASCII character ~S (either an encoding error or an attempt at breaking security)." (unicode->char mbuf))))
                                     ;; Long sequences can yield values
                                     ;; beyond the Unicode range; report
                                     ;; them as coding errors instead of
                                     ;; passing them to UNICODE->CHAR.
                                     (when (> mbuf #x10ffff)
                                       (failure "UTF-8 sequence denoted code point ~D, which is outside the Unicode range." mbuf))
                                     (vector-push-extend (unicode->char mbuf) charseq))))))
                         (return-from decode (= mlen 0)))
                     (:replace-char (&optional (replacement (unicode->char #xfffd)))
                       :report "Replace the invalid bytes with a character."
                       (vector-push-extend replacement charseq)
                       (resynchronize))
                     (:skip-char ()
                       :report "Ignore the invalid byte sequence."
                       (resynchronize)))))))
      #'decode)))
391 | ||
;; UTF8 and the keywords :UTF-8 / :UTF8 share the constructors registered
;; under UTF-8.
(define-codec-synonyms utf-8
  utf8
  :utf-8 :utf8)