| 1 | ;;;; CHAR-CODE -- Conversions between characters and byte |
| 2 | ;;;; representations thereof |
| 3 | |
;;; Package definition.  The exported names are written as uninterned
;;; symbols; the standard reader upcases them to the same strings.
(defpackage :charcode
  (:use :cl)
  (:export #:make-encoder #:make-decoder
           #:encode-string #:decode-string
           #:system-charset
           #:no-codec-error #:coding-error
           #:make-codec-character-stream
           #:ascii #:latin-1 #:latin1 #:utf-8 #:utf8))

(in-package :charcode)
| 11 | |
| 12 | ;;; General stuff |
| 13 | |
;; Signalled by MAKE-ENCODER and MAKE-DECODER when the requested name has
;; no registered constructor.
(define-condition no-codec-error (error)
  ((codec-name :initarg :codec-name))
  (:report (lambda (condition stream)
             (format stream "Could find no codec named ~A."
                     (slot-value condition 'codec-name)))))
| 19 | |
(define-condition coding-error (error)
  ((input :initarg :input)
   (position :initarg :position)
   (result :initarg :result))
  (:documentation
   "Error raised while encoding or decoding.  The INPUT, POSITION and
RESULT slots are filled from the :INPUT, :POSITION and :RESULT initargs."))

(define-condition simple-coding-error (coding-error simple-error)
  ()
  (:documentation
   "A CODING-ERROR whose message comes from a format control and arguments."))
| 26 | |
(defun coding-error (input position result format &rest format-args)
  "Signal a SIMPLE-CODING-ERROR carrying INPUT, POSITION and RESULT.
FORMAT and FORMAT-ARGS become the condition's format control and
format arguments."
  (error (make-condition 'simple-coding-error
                         :input input
                         :position position
                         :result result
                         :format-control format
                         :format-arguments format-args)))
| 31 | |
(deftype decoder-fun ()
  "Type of a decoder: takes a byte array, a character array and the
keywords :START and :END, and returns T or NIL."
  '(function ((array (unsigned-byte 8))
              (array character)
              &key (start fixnum) (end fixnum))
    boolean))
(deftype encoder-fun ()
  "Type of an encoder: takes a character array, a byte array and the
keywords :START and :END, and returns T or NIL."
  '(function ((array character)
              (array (unsigned-byte 8))
              &key (start fixnum) (end fixnum))
    boolean))
| 40 | |
(defmacro define-encoder ((name) &body make-encoder)
  "Store, under the MAKE-ENCODER property of the symbol NAME, a
zero-argument function whose body is MAKE-ENCODER."
  (let ((constructor `(lambda () ,@make-encoder)))
    `(setf (get ',name 'make-encoder) (function ,constructor))))
| 43 | |
(defmacro define-decoder ((name) &body make-decoder)
  "Store, under the MAKE-DECODER property of the symbol NAME, a
zero-argument function whose body is MAKE-DECODER."
  (let ((constructor `(lambda () ,@make-decoder)))
    `(setf (get ',name 'make-decoder) (function ,constructor))))
| 46 | |
(defmacro define-codec-synonyms (name &rest synonyms)
  "Copy the MAKE-ENCODER and MAKE-DECODER properties of NAME onto each
symbol in SYNONYMS.  The copies happen at load or execution time."
  `(eval-when (:load-toplevel :execute)
     ,@(loop for alias in synonyms
             collect `(setf (get ',alias 'make-encoder) (get ',name 'make-encoder)
                            (get ',alias 'make-decoder) (get ',name 'make-decoder)))))
| 53 | |
(defun make-encoder (name)
  "Return a fresh encoder function for the codec registered under NAME.
Signals NO-CODEC-ERROR when NAME has no MAKE-ENCODER property."
  (let ((constructor (get name 'make-encoder)))
    (unless constructor
      (error 'no-codec-error :codec-name name))
    (the encoder-fun (values (funcall constructor)))))
| 57 | |
(defun make-decoder (name)
  "Return a fresh decoder function for the codec registered under NAME.
Signals NO-CODEC-ERROR when NAME has no MAKE-DECODER property."
  (let ((constructor (get name 'make-decoder)))
    (unless constructor
      (error 'no-codec-error :codec-name name))
    (the decoder-fun (values (funcall constructor)))))
| 61 | |
(defun system-charset ()
  "Return the codec name used as the default by ENCODE-STRING and
DECODE-STRING; currently always the symbol UTF-8."
  ;; XXX: Replace me with something perhaps more sensible.
  (quote utf-8))
| 65 | |
(defun encode-string (string &optional (coding (system-charset)))
  "Encode STRING with the codec named CODING and return the bytes as an
adjustable (UNSIGNED-BYTE 8) vector with a fill pointer.
Signals NO-CODEC-ERROR for an unknown CODING, and a coding error when
the encoder returns NIL."
  (declare (type string string))
  (let* ((encoder (make-encoder coding))
         ;; The encoders declare their input as (ARRAY CHARACTER).  On
         ;; implementations with a distinct BASE-CHAR, a base-string is
         ;; a STRING but not of that type, so normalise it first.
         ;; COERCE returns STRING itself when it already qualifies.
         (chars (coerce string '(vector character)))
         (buf (make-array (length chars)
                          :element-type '(unsigned-byte 8)
                          :adjustable t
                          :fill-pointer 0)))
    (unless (funcall encoder chars buf)
      (coding-error string (length string) buf
                    "Encoding of string in ~A ended prematurely." coding))
    buf))
| 73 | |
(defun decode-string (buffer &optional (coding (system-charset)))
  "Decode the bytes in BUFFER with the codec named CODING and return the
characters as an adjustable character vector with a fill pointer.
Signals a coding error when the decoder returns NIL."
  (declare (type (array (unsigned-byte 8)) buffer))
  (let* ((decoder (make-decoder coding))
         (octet-count (length buffer))
         (chars (make-array octet-count
                            :element-type 'character
                            :adjustable t
                            :fill-pointer 0)))
    (unless (funcall decoder buffer chars)
      (coding-error buffer octet-count chars
                    "~A byte sequence ended prematurely." coding))
    chars))
| 81 | |
| 82 | ;;; Gray stream implementation |
| 83 | |
| 84 | ;; Disabled for now. There doesn't seem to be any good way to get |
| 85 | ;; these working generally over various implementations. |
| 86 | |
| 87 | #+unused ( |
(defclass codec-character-stream (fundamental-character-input-stream
                                  fundamental-character-output-stream)
  ((decoder :initarg :decoder)
   (encoder :initarg :encoder)
   (back :initarg :back)
   (read-pos :initform 0)
   (buffer :initform (make-array 64
                                 :element-type 'character
                                 :adjustable t
                                 :fill-pointer 0)))
  (:documentation
   "Character stream layered over the stream in BACK.  Bytes read from
BACK are decoded into BUFFER; READ-POS indexes the next unread character."))
| 94 | |
(defun make-codec-character-stream (real-stream &optional (charset (system-charset)))
  "Wrap REAL-STREAM in a CODEC-CHARACTER-STREAM that uses the decoder
and encoder registered under CHARSET."
  (declare (type stream real-stream))
  (let* ((decoder (make-decoder charset))
         (encoder (make-encoder charset)))
    (make-instance 'codec-character-stream
                   :decoder decoder
                   :encoder encoder
                   :back real-stream)))
| 98 | |
;; Close the backing stream first, then run the next CLOSE method.
(defmethod close ((stream codec-character-stream) &key abort)
  (close (slot-value stream 'back) :abort abort)
  (call-next-method))
| 103 | |
;; The codec stream is open exactly when its backing stream is.
;; The query must go to BACK: asking STREAM itself would re-enter this
;; method and never terminate.
(defmethod open-stream-p ((stream codec-character-stream))
  (with-slots (back) stream
    (open-stream-p back)))
| 107 | |
(defun ccs-ensure-buffer (stream len)
  "Read bytes from the backing stream and decode them into the buffer
until at least LEN unread characters are buffered.  Return T on
success, or NIL as soon as a read from the backing stream yields zero
bytes."
  (declare (type codec-character-stream stream)
           (type integer len))
  (with-slots (decoder back buffer read-pos) stream
    (let ((octets (make-array len :element-type '(unsigned-byte 8))))
      (loop
        ;; MISSING = characters still needed beyond what is buffered.
        (let ((missing (- len (- (length buffer) read-pos))))
          (when (<= missing 0)
            (return t))
          (let ((count (read-sequence octets back :end missing)))
            (when (zerop count)
              (return nil))
            (funcall decoder octets buffer :end count)))))))
| 118 | |
(defun ccs-clear-buffer (stream)
  "Drop the already-read prefix of the buffer: move the unread
characters to the front, shrink the fill pointer and reset READ-POS to 0."
  (declare (type codec-character-stream stream))
  (let ((buffer (slot-value stream 'buffer))
        (consumed (slot-value stream 'read-pos)))
    (replace buffer buffer :start2 consumed)
    (setf (fill-pointer buffer) (- (fill-pointer buffer) consumed)
          (slot-value stream 'read-pos) 0)))
| 125 | |
;; Return the next buffered character, or :EOF when none can be made
;; available.  Once READ-POS reaches 16 the consumed prefix is compacted.
(defmethod stream-read-char ((stream codec-character-stream))
  (if (ccs-ensure-buffer stream 1)
      (with-slots (read-pos buffer) stream
        (let ((char (aref buffer read-pos)))
          (incf read-pos)
          (when (>= read-pos 16)
            (ccs-clear-buffer stream))
          char))
      :eof))
| 133 | |
;; Push CHAR back so that it is the next character read.  Returns NIL.
;;
;; When nothing has been consumed yet (READ-POS is 0) there is no slot
;; in front of the unread data, so 16 slots of head-room are opened by
;; shifting the buffer contents to the right.
(defmethod stream-unread-char ((stream codec-character-stream) char)
  (with-slots (read-pos buffer) stream
    (when (= read-pos 0)
      (let* ((len (length buffer))
             (new-len (+ len 16)))
        ;; Grow the storage first; the fill pointer may only be raised
        ;; once the array is large enough to hold it.
        (when (< (array-dimension buffer 0) new-len)
          (adjust-array buffer (list new-len)))
        (setf (fill-pointer buffer) new-len)
        ;; Copy from a snapshot: REPLACE with overlapping regions of the
        ;; same array has undefined results.
        (replace buffer (subseq buffer 0 len) :start1 16)
        ;; The unread data now starts at index 16.
        (setf read-pos 16)))
    (setf (aref buffer (decf read-pos)) char)
    nil))
| 144 | |
(defun ccs-wont-hang-p (stream)
  "Return the next buffered character when one is available; otherwise
return the result of LISTEN on the backing stream."
  (declare (type codec-character-stream stream))
  (with-slots (read-pos back buffer) stream
    (if (< read-pos (length buffer))
        (aref buffer read-pos)
        (listen back))))
| 150 | |
;; Read a character only when CCS-WONT-HANG-P reports input is available;
;; otherwise return NIL.
(defmethod stream-read-char-no-hang ((stream codec-character-stream))
  (when (ccs-wont-hang-p stream)
    (stream-read-char stream)))
| 155 | |
;; Return the next character without consuming it, or :EOF when the
;; buffer cannot be filled.
(defmethod stream-peek-char ((stream codec-character-stream))
  (if (ccs-ensure-buffer stream 1)
      (with-slots (read-pos buffer) stream
        (aref buffer read-pos))
      :eof))
| 161 | |
;; Return the next character when input is available and the stream is
;; not at end of file; otherwise NIL.
(defmethod stream-listen ((stream codec-character-stream))
  (when (ccs-wont-hang-p stream)
    (let ((next (stream-peek-char stream)))
      (unless (eq next :eof)
        next))))
| 169 | |
;; Encode CHAR into a scratch byte vector and write the bytes to the
;; backing stream.
(defmethod stream-write-char ((stream codec-character-stream) char)
  (with-slots (encoder back) stream
    (let ((source (make-string 1 :initial-element char))
          (octets (make-array 16
                              :element-type '(unsigned-byte 8)
                              :adjustable t
                              :fill-pointer 0)))
      (funcall encoder source octets)
      (write-sequence octets back))))
| 176 | |
;; Delegate FINISH-OUTPUT to the backing stream.
(defmethod stream-finish-output ((stream codec-character-stream))
  (with-slots (back) stream
    (finish-output back)))
| 179 | |
;; Delegate FORCE-OUTPUT to the backing stream.
(defmethod stream-force-output ((stream codec-character-stream))
  (with-slots (back) stream
    (force-output back)))
| 182 | |
;; Fill SEQ between START and END with decoded characters and return the
;; index of the first element of SEQ that was not updated, as
;; READ-SEQUENCE does.  A NIL END means the end of SEQ.
(defmethod stream-read-sequence ((stream codec-character-stream) seq &optional (start 0) (end (length seq)))
  (let ((end (or end (length seq))))
    (ccs-ensure-buffer stream (- end start))
    (with-slots (read-pos buffer) stream
      ;; Fewer characters than requested may be buffered at end of file.
      (let ((len (min (- end start) (- (length buffer) read-pos))))
        (replace seq buffer
                 :start1 start :end1 end
                 :start2 read-pos :end2 (length buffer))
        ;; Compact the buffer once 128 or more characters are consumed.
        (when (>= (incf read-pos len) 128)
          (ccs-clear-buffer stream))
        (+ start len)))))
| 190 | |
;; Encode the characters of SEQ between START and END and write the
;; bytes to the backing stream.  Returns SEQ, as WRITE-SEQUENCE does.
;; A NIL END means the end of SEQ.
(defmethod stream-write-sequence ((stream codec-character-stream) seq &optional (start 0) (end (length seq)))
  (with-slots (encoder back) stream
    (let* ((end (or end (length seq)))
           (outbuf (make-array (- end start)
                               :element-type '(unsigned-byte 8)
                               :adjustable t
                               :fill-pointer 0)))
      ;; Pass the bounds on; without them the encoder would process all
      ;; of SEQ regardless of START and END.
      (funcall encoder seq outbuf :start start :end end)
      (write-sequence outbuf back)
      seq)))
| 196 | ) |
| 197 | |
| 198 | ;;; Implementation-specific functions |
| 199 | |
#+(or (and clisp unicode) sbcl abcl)
(defun unicode->char (unicode)
  "Return the result of CODE-CHAR on the code point UNICODE."
  (declare (type (unsigned-byte 24) unicode))
  (values (code-char unicode)))
| 204 | |
#+(or (and clisp unicode) sbcl abcl)
(defun char->unicode (char)
  "Return the result of CHAR-CODE on CHAR."
  (declare (type character char))
  (values (char-code char)))
| 209 | |
#+ecl
(defun unicode->char (unicode)
  "Return the character for the code point UNICODE.  Code points of 256
and above signal an error."
  (declare (type (unsigned-byte 24) unicode))
  (if (< unicode 256)
      (code-char unicode)
      (error "ECL does not handle Unicode characters outside Latin-1.")))
| 216 | |
#+ecl
(defun char->unicode (char)
  "Return the result of CHAR-CODE on CHAR."
  (declare (type character char))
  (values (char-code char)))
| 221 | |
| 222 | ;;; ASCII |
| 223 | |
(defun decode-ascii (byteseq charseq &key (start 0) (end (length byteseq)))
  "Decode the bytes of BYTESEQ between START and END as ASCII, appending
the characters to CHARSEQ with VECTOR-PUSH-EXTEND.  Returns T once END
is reached.  A byte of 128 or more signals a coding error; the restarts
:REPLACE-CHAR and :SKIP-CHAR resume decoding after the offending byte."
  (declare (type (array (unsigned-byte 8)) byteseq)
           (type (array character) charseq)
           (type fixnum start end))
  (loop
    ;; Each pass decodes until the end or until an error is handled by a
    ;; restart; START is already past the offending byte by then.
    (restart-case
        (loop
          (when (>= start end)
            (return-from decode-ascii t))
          (let* ((index (prog1 start (incf start)))
                 (octet (aref byteseq index)))
            (when (>= octet 128)
              (coding-error byteseq start charseq "Invalid byte ~D in ASCII stream." octet))
            (vector-push-extend (unicode->char octet) charseq)))
      (:replace-char (&optional (replacement (unicode->char #xfffd)))
        :report "Replace the invalid byte with a character."
        (vector-push-extend replacement charseq))
      (:skip-char ()
        :report "Ignore the invalid byte."
        nil))))
| 242 | |
(defun encode-ascii (charseq byteseq &key (start 0) (end (length charseq)))
  "Encode the characters of CHARSEQ between START and END as ASCII,
appending the bytes to BYTESEQ with VECTOR-PUSH-EXTEND.  Returns T once
END is reached.  A code point of 128 or more signals a coding error;
the restarts :REPLACE-CHAR and :SKIP-CHAR resume encoding after the
offending character."
  (declare (type (array (unsigned-byte 8)) byteseq)
           (type (array character) charseq)
           (type fixnum start end))
  (loop
    ;; Each pass encodes until the end or until an error is handled by a
    ;; restart; START is already past the offending character by then.
    (restart-case
        (loop
          (unless (< start end) (return-from encode-ascii t))
          (let ((cp (char->unicode (aref charseq (prog1 start (incf start))))))
            ;; 127 is the highest ASCII code point; 128 is already invalid.
            (unless (< cp 128)
              (coding-error charseq start byteseq "ASCII cannot encode code-points higher than 127."))
            (vector-push-extend cp byteseq)))
      (:replace-char (&optional (replacement #\?))
        :report "Replace this character with another."
        (vector-push-extend (char->unicode replacement) byteseq))
      (:skip-char ()
        :report "Ignore this character."
        nil))))
| 262 | |
;; Register the ASCII codec under the symbol ASCII, then alias it to the
;; keyword :ASCII.  The synonyms must come after both registrations.
(define-encoder (ascii)
  (function encode-ascii))

(define-decoder (ascii)
  (function decode-ascii))

(define-codec-synonyms ascii :ascii)
| 270 | |
| 271 | ;;; Latin-1 |
| 272 | |
(defun decode-latin-1 (byteseq charseq &key (start 0) (end (length byteseq)))
  "Decode the bytes of BYTESEQ between START and END as Latin-1,
appending one character per byte to CHARSEQ.  Always returns T."
  (declare (type (array (unsigned-byte 8)) byteseq)
           (type (array character) charseq)
           (type fixnum start end))
  (loop for index from start below end
        do (vector-push-extend (unicode->char (aref byteseq index)) charseq))
  t)
| 281 | |
(defun encode-latin-1 (charseq byteseq &key (start 0) (end (length charseq)))
  "Encode the characters of CHARSEQ between START and END as Latin-1,
appending the bytes to BYTESEQ with VECTOR-PUSH-EXTEND.  Returns T once
END is reached.  A code point of 256 or more signals a coding error;
the restarts :REPLACE-CHAR and :SKIP-CHAR resume encoding after the
offending character."
  (declare (type (array (unsigned-byte 8)) byteseq)
           (type (array character) charseq)
           (type fixnum start end))
  (loop
    ;; Each pass encodes until the end or until an error is handled by a
    ;; restart; START is already past the offending character by then.
    (restart-case
        (loop
          (unless (< start end) (return-from encode-latin-1 t))
          (let ((cp (char->unicode (aref charseq (prog1 start (incf start))))))
            ;; 255 is the highest Latin-1 code point; 256 is already invalid.
            (unless (< cp 256)
              (coding-error charseq start byteseq "ISO-8859-1 cannot encode code-points higher than 255."))
            (vector-push-extend cp byteseq)))
      (:replace-char (&optional (replacement #\?))
        :report "Replace this character with another."
        (vector-push-extend (char->unicode replacement) byteseq))
      (:skip-char ()
        :report "Ignore this character."
        nil))))
| 301 | |
;; Register the Latin-1 codec under the symbol LATIN-1, then alias it to
;; its alternative spellings.  The synonyms must come after both
;; registrations.
(define-encoder (latin-1)
  (function encode-latin-1))

(define-decoder (latin-1)
  (function decode-latin-1))

(define-codec-synonyms latin-1
  latin1 iso-8859-1
  :latin-1 :latin1 :iso-8859-1)
| 309 | |
| 310 | ;;; UTF-8 |
| 311 | |
(defun encode-utf-8 (charseq byteseq &key (start 0) (end (length charseq)))
  "Encode the characters of CHARSEQ between START and END as UTF-8,
appending the bytes to BYTESEQ with VECTOR-PUSH-EXTEND.  Always returns T."
  (declare (type (array (unsigned-byte 8)) byteseq)
           (type (array character) charseq)
           (type fixnum start end))
  (loop for index from start below end
        do (let ((code (char->unicode (aref charseq index))))
             (if (< code 128)
                 ;; Code points below 128 are written as a single byte.
                 (vector-push-extend code byteseq)
                 (let ((tail '())
                       (tail-count 0))
                   ;; Peel off 6 bits at a time, lowest first, as
                   ;; continuation bytes until the rest fits in the lead
                   ;; byte.  PUSH leaves TAIL in output order.
                   (loop
                     (push (logior #x80 (logand code #x3F)) tail)
                     (setf code (ash code -6))
                     (incf tail-count)
                     (when (< code (expt 2 (- 6 tail-count)))
                       (return)))
                   ;; Lead byte: marker bits on top, remaining bits below.
                   (vector-push-extend
                    (logior (logand #xff (lognot (1- (expt 2 (- 7 tail-count)))))
                            code)
                    byteseq)
                   (dolist (octet tail)
                     (vector-push-extend octet byteseq))))))
  t)
| 335 | |
;; Register ENCODE-UTF-8 as the encoder for the symbol UTF-8.
(define-encoder (utf-8)
  (function encode-utf-8))
| 338 | |
;; UTF-8 decoder.  Each decoder built here closes over its own
;; PENDING-CODE (bits gathered so far for the current multibyte sequence)
;; and PENDING-COUNT (continuation bytes still expected), so a sequence
;; cut off at the end of one call is resumed by the next call.  The
;; decoder returns T when it stops on a character boundary, NIL otherwise.
(define-decoder (utf-8)
  (let ((pending-code 0)
        (pending-count 0))
    (flet ((decode (byteseq charseq &key (start 0) (end (length byteseq)))
             (declare (type (array (unsigned-byte 8)) byteseq)
                      (type (array character) charseq)
                      (type fixnum start end))
             (let ((pos start))
               (flet ((failure (control &rest args)
                        ;; POS is already past the byte being reported.
                        (error 'simple-coding-error
                               :input byteseq :position pos :result charseq
                               :format-control control :format-arguments args))
                      (resync ()
                        ;; Skip any continuation bytes (10xxxxxx) that
                        ;; follow and forget the partial sequence.
                        (loop while (and (< pos end)
                                         (= (logand (aref byteseq pos) #xC0) #x80))
                              do (incf pos))
                        (setf pending-count 0)))
                 (loop
                   (restart-case
                       (progn
                         (loop while (< pos end)
                               do (let ((octet (aref byteseq (prog1 pos (incf pos)))))
                                    (cond
                                      ;; Single-byte character.
                                      ((and (zerop pending-count) (< octet 128))
                                       (vector-push-extend (unicode->char octet) charseq))
                                      ;; Lead byte: count the leading 1 bits
                                      ;; among bits 7..1.
                                      ((zerop pending-count)
                                       (let ((ones (loop for bit from 7 downto 1
                                                         while (logbitp bit octet)
                                                         count t)))
                                         (cond ((< ones 2)
                                                (failure "UTF-8 sequence started with continuation byte: ~D" octet))
                                               ((= ones 7)
                                                (failure "Invalid UTF-8 sequence start byte: ~D" octet)))
                                         (setf pending-count (1- ones)
                                               pending-code (ldb (byte (- 6 pending-count) 0) octet))))
                                      ;; Continuation byte of a sequence.
                                      (t
                                       (unless (= (logand octet #xC0) #x80)
                                         (failure "Invalid UTF-8 continuation byte: ~D" octet))
                                       (setf pending-code (logior (ash pending-code 6)
                                                                  (logand octet #x3F)))
                                       (when (zerop (decf pending-count))
                                         (when (< pending-code 128)
                                           (with-simple-restart (:accept "Accept anyway.")
                                             (failure "UTF-8 multibyte sequence denoted an ASCII character ~S (either an encoding error or an attempt at breaking security)." (unicode->char pending-code))))
                                         (vector-push-extend (unicode->char pending-code) charseq))))))
                         (return-from decode (= pending-count 0)))
                     (:replace-char (&optional (replacement (unicode->char #xfffd)))
                       :report "Replace the invalid bytes with a character."
                       (vector-push-extend replacement charseq)
                       (resync))
                     (:skip-char ()
                       :report "Ignore the invalid byte sequence."
                       (resync))))))))
      #'decode)))
| 391 | |
;; Alias the UTF-8 codec to its alternative spellings.
(define-codec-synonyms utf-8
  utf8
  :utf-8 :utf8)