decode.lisp - clic - Clic is an command line interactive client for gopher written in Common LISP (HTM) git clone git://bitreich.org/clic/ git://enlrupgkhuxnvlhsf6lc3fziv5h2hhfrinws65d7roiv6bfj7d652fid.onion/clic/ (DIR) Log (DIR) Files (DIR) Refs (DIR) Tags (DIR) README (DIR) LICENSE --- decode.lisp (25456B) --- 1 ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: FLEXI-STREAMS; Base: 10 -*- 2 ;;; $Header: /usr/local/cvsrep/flexi-streams/decode.lisp,v 1.35 2008/08/26 10:59:22 edi Exp $ 3 4 ;;; Copyright (c) 2005-2008, Dr. Edmund Weitz. All rights reserved. 5 6 ;;; Redistribution and use in source and binary forms, with or without 7 ;;; modification, are permitted provided that the following conditions 8 ;;; are met: 9 10 ;;; * Redistributions of source code must retain the above copyright 11 ;;; notice, this list of conditions and the following disclaimer. 12 13 ;;; * Redistributions in binary form must reproduce the above 14 ;;; copyright notice, this list of conditions and the following 15 ;;; disclaimer in the documentation and/or other materials 16 ;;; provided with the distribution. 17 18 ;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED 19 ;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 ;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 ;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY 22 ;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 ;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 24 ;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 25 ;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 26 ;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 27 ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 28 ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 30 (in-package :flexi-streams) 31 32 (defun recover-from-encoding-error (external-format format-control &rest format-args) 33 "Helper function used by OCTETS-TO-CHAR-CODE below to deal with 34 encoding errors. Checks if *SUBSTITUTION-CHAR* is not NIL and returns 35 its character code in this case. Otherwise signals an 36 EXTERNAL-FORMAT-ENCODING-ERROR as determined by the arguments to this 37 function and provides a corresponding USE-VALUE restart." 38 (when *substitution-char* 39 (return-from recover-from-encoding-error (char-code *substitution-char*))) 40 (restart-case 41 (apply #'signal-encoding-error external-format format-control format-args) 42 (use-value (char) 43 :report "Specify a character to be used instead." 44 :interactive (lambda () 45 (loop 46 (format *query-io* "Type a character: ") 47 (let ((line (read-line *query-io*))) 48 (when (= 1 (length line)) 49 (return (list (char line 0))))))) 50 (char-code char)))) 51 52 (defgeneric octets-to-char-code (format reader) 53 (declare #.*standard-optimize-settings*) 54 (:documentation "Converts a sequence of octets to a character code 55 \(which is returned, or NIL in case of EOF) using the external format 56 FORMAT. The sequence is obtained by calling the function \(which must 57 be a functional object) READER with no arguments which should return 58 one octet per call. In the case of EOF, READER should return NIL. 59 60 The special variable *CURRENT-UNREADER* must be bound correctly 61 whenever this function is called.")) 62 63 (defgeneric octets-to-string* (format sequence start end) 64 (declare #.*standard-optimize-settings*) 65 (:documentation "A generic function which dispatches on the external 66 format and does the real work for OCTETS-TO-STRING.")) 67 68 (defmethod octets-to-string* :around (format (list list) start end) 69 (declare #.*standard-optimize-settings*) 70 (octets-to-string* format (coerce list 'vector) start end)) 71 72 (defmacro define-sequence-readers ((format-class) &body body) 73 "Non-hygienic utility macro which defines methods for READ-SEQUENCE* 74 and OCTETS-TO-STRING* for the class FORMAT-CLASS. BODY is described 75 in the docstring of DEFINE-CHAR-ENCODERS but can additionally contain 76 a form \(UNGET <form>) which has to be replaced by the correct code to 77 `unread' the octets for the character designated by <form>." 78 (let* ((body `((block char-decoder 79 (locally 80 (declare #.*fixnum-optimize-settings*) 81 ,@body))))) 82 `(progn 83 (defmethod read-sequence* ((format ,format-class) flexi-input-stream sequence start end) 84 (with-accessors ((position flexi-stream-position) 85 (bound flexi-stream-bound) 86 (octet-stack flexi-stream-octet-stack) 87 (last-octet flexi-stream-last-octet) 88 (last-char-code flexi-stream-last-char-code) 89 (stream flexi-stream-stream)) 90 flexi-input-stream 91 (let* (buffer 92 (buffer-pos 0) 93 (buffer-end 0) 94 (index start) 95 donep 96 ;; whether we will later be able to rewind the stream if 97 ;; needed (to get rid of unused octets in the buffer) 98 (can-rewind-p (maybe-rewind stream 0)) 99 (factor (encoding-factor format)) 100 (integer-factor (floor factor)) 101 ;; it's an interesting question whether it makes sense 102 ;; performance-wise to make RESERVE significantly bigger 103 ;; (and thus put potentially a lot more octets into 104 ;; OCTET-STACK), especially for UTF-8 105 (reserve (cond ((or (not (floatp factor)) 106 (not can-rewind-p)) 0) 107 (t (ceiling (* (- factor integer-factor) (- end start))))))) 108 (declare (fixnum buffer-pos buffer-end index integer-factor reserve) 109 (boolean can-rewind-p)) 110 (flet ((compute-fill-amount () 111 "Computes the amount of octets we can savely read into 112 the buffer without violating the stream's bound \(if there is one) and 113 without potentially reading much more than we need \(unless we can 114 rewind afterwards)." 115 (let ((minimum (min (the fixnum (+ (the fixnum (* integer-factor 116 (the fixnum (- end index)))) 117 reserve)) 118 +buffer-size+))) 119 (cond (bound (min minimum (- bound position))) 120 (t minimum)))) 121 (fill-buffer (end) 122 "Tries to fill the buffer from BUFFER-POS to END and 123 returns NIL if the buffer doesn't contain any new data." 124 (when donep 125 (return-from fill-buffer nil)) 126 ;; put data from octet stack into buffer if there is any 127 (loop 128 (when (>= buffer-pos end) 129 (return)) 130 (let ((next-octet (pop octet-stack))) 131 (cond (next-octet 132 (setf (aref (the (array octet *) buffer) buffer-pos) (the octet next-octet)) 133 (incf buffer-pos)) 134 (t (return))))) 135 (setq buffer-end (read-sequence buffer stream 136 :start buffer-pos 137 :end end)) 138 ;; we reached EOF, so we remember this 139 (when (< buffer-end end) 140 (setq donep t)) 141 ;; BUFFER-POS is only greater than zero if the buffer 142 ;; already contains unread data from the octet stack 143 ;; (see below), so we test for ZEROP here and do /not/ 144 ;; compare with BUFFER-POS 145 (unless (zerop buffer-end) 146 (incf position buffer-end)))) 147 (let ((minimum (compute-fill-amount))) 148 (declare (fixnum minimum)) 149 (setq buffer (make-octet-buffer minimum)) 150 ;; fill buffer for the first time or return immediately if 151 ;; we don't succeed 152 (unless (fill-buffer minimum) 153 (return-from read-sequence* start))) 154 (setq buffer-pos 0) 155 (macrolet ((iterate (set-place) 156 "A very unhygienic macro to implement the 157 actual iteration through the sequence including housekeeping for the 158 flexi stream. SET-PLACE is the place \(using the index INDEX) used to 159 access the sequence." 160 `(flet ((leave () 161 "This is the function used to 162 abort the LOOP iteration below." 163 (when (> index start) 164 (setq last-octet nil 165 last-char-code ,(sublis '((index . (1- index))) set-place))) 166 (return-from read-sequence* index))) 167 (loop 168 (when (>= index end) 169 ;; check if there are octets in the 170 ;; buffer we didn't use - see 171 ;; COMPUTE-FILL-AMOUNT above 172 (let ((rest (- buffer-end buffer-pos))) 173 (when (plusp rest) 174 (or (and can-rewind-p 175 (maybe-rewind stream rest)) 176 (loop 177 (when (>= buffer-pos buffer-end) 178 (return)) 179 (decf buffer-end) 180 (push (aref (the (array octet *) buffer) buffer-end) 181 octet-stack))))) 182 (leave)) 183 (let ((next-char-code 184 (progn (symbol-macrolet 185 ((octet-getter 186 ;; this is the code to retrieve the next octet (or 187 ;; NIL) and to fill the buffer if needed 188 (block next-octet 189 (when (>= buffer-pos buffer-end) 190 (setq buffer-pos 0) 191 (unless (fill-buffer (compute-fill-amount)) 192 (return-from next-octet))) 193 (prog1 194 (aref (the (array octet *) buffer) buffer-pos) 195 (incf buffer-pos))))) 196 (macrolet ((unget (form) 197 `(unread-char% ,form flexi-input-stream))) 198 ,',@body))))) 199 (unless next-char-code 200 (leave)) 201 (setf ,set-place (code-char next-char-code)) 202 (incf index)))))) 203 (etypecase sequence 204 (string (iterate (char sequence index))) 205 (array (iterate (aref sequence index))) 206 (list (iterate (nth index sequence))))))))) 207 (defmethod octets-to-string* ((format ,format-class) sequence start end) 208 (declare #.*standard-optimize-settings*) 209 (declare (fixnum start end)) 210 (let* ((i start) 211 (string-length (compute-number-of-chars format sequence start end)) 212 (string (make-array string-length :element-type 'char*))) 213 (declare (fixnum i string-length)) 214 (loop for j of-type fixnum from 0 below string-length 215 do (setf (schar string j) 216 (code-char (macrolet ((unget (form) 217 `(decf i (character-length format ,form)))) 218 (symbol-macrolet ((octet-getter (and (< i end) 219 (prog1 220 (the octet (aref sequence i)) 221 (incf i))))) 222 ,@body)))) 223 finally (return string))))))) 224 225 (defmacro define-char-decoders ((lf-format-class cr-format-class crlf-format-class) &body body) 226 "Non-hygienic utility macro which defines several decoding-related 227 methods for the classes LF-FORMAT-CLASS, CR-FORMAT-CLASS, and 228 CRLF-FORMAT-CLASS where it is assumed that CR-FORMAT-CLASS is the same 229 encoding as LF-FORMAT-CLASS but with CR instead of LF line endings and 230 similar for CRLF-FORMAT-CLASS, i.e. LF-FORMAT-CLASS is the base class. 231 BODY is a code template for the code to read octets and return one 232 character code. BODY must contain a symbol OCTET-GETTER representing 233 the form which is used to obtain the next octet." 234 (let* ((body (with-unique-names (char-code) 235 `((let ((,char-code (progn ,@body))) 236 (when (and ,char-code 237 (or (<= #xd8 (logand* #x00ff (ash* ,char-code -8)) #xdf) 238 (> ,char-code #x10ffff))) 239 (recover-from-encoding-error format "Illegal code point ~A \(#x~:*~X)." ,char-code)) 240 ,char-code))))) 241 `(progn 242 (defmethod octets-to-char-code ((format ,lf-format-class) reader) 243 (declare #.*fixnum-optimize-settings*) 244 (declare (function reader)) 245 (symbol-macrolet ((octet-getter (funcall reader))) 246 ,@(sublis '((char-decoder . octets-to-char-code)) 247 body))) 248 (define-sequence-readers (,lf-format-class) ,@body) 249 (define-sequence-readers (,cr-format-class) 250 ,(with-unique-names (char-code) 251 `(let ((,char-code (progn ,@body))) 252 (case ,char-code 253 (#.+cr+ #.(char-code #\Newline)) 254 (otherwise ,char-code))))) 255 (define-sequence-readers (,crlf-format-class) 256 ,(with-unique-names (char-code next-char-code get-char-code) 257 `(flet ((,get-char-code () ,@body)) 258 (let ((,char-code (,get-char-code))) 259 (case ,char-code 260 (#.+cr+ 261 (let ((,next-char-code (,get-char-code))) 262 (case ,next-char-code 263 (#.+lf+ #.(char-code #\Newline)) 264 ;; we saw a CR but no LF afterwards, but then the data 265 ;; ended, so we just return #\Return 266 ((nil) +cr+) 267 ;; if the character we peeked at wasn't a 268 ;; linefeed character we unread its constituents 269 (otherwise (unget (code-char ,next-char-code)) 270 ,char-code)))) 271 (otherwise ,char-code))))))))) 272 273 (define-char-decoders (flexi-latin-1-format flexi-cr-latin-1-format flexi-crlf-latin-1-format) 274 octet-getter) 275 276 (define-char-decoders (flexi-ascii-format flexi-cr-ascii-format flexi-crlf-ascii-format) 277 (when-let (octet octet-getter) 278 (if (> (the octet octet) 127) 279 (recover-from-encoding-error format 280 "No character which corresponds to octet #x~X." octet) 281 octet))) 282 283 (define-char-decoders (flexi-8-bit-format flexi-cr-8-bit-format flexi-crlf-8-bit-format) 284 (with-accessors ((decoding-table external-format-decoding-table)) 285 format 286 (when-let (octet octet-getter) 287 (let ((char-code (aref (the (simple-array char-code-integer *) decoding-table) 288 (the octet octet)))) 289 (if (or (null char-code) 290 (= (the char-code-integer char-code) 65533)) 291 (recover-from-encoding-error format 292 "No character which corresponds to octet #x~X." octet) 293 char-code))))) 294 295 (define-char-decoders (flexi-utf-8-format flexi-cr-utf-8-format flexi-crlf-utf-8-format) 296 (let (first-octet-seen) 297 (declare (boolean first-octet-seen)) 298 (macrolet ((read-next-byte () 299 '(prog1 300 (or octet-getter 301 (cond (first-octet-seen 302 (return-from char-decoder 303 (recover-from-encoding-error format 304 "End of data while in UTF-8 sequence."))) 305 (t (return-from char-decoder nil)))) 306 (setq first-octet-seen t)))) 307 (flet ((recover-from-overlong-sequence (value) 308 (restart-case 309 (recover-from-encoding-error format "`Overlong' UTF-8 sequence for code point #x~X." 310 value) 311 (accept-overlong-sequence () 312 :report "Accept the code point and continue." 313 value)))) 314 (let ((octet (read-next-byte))) 315 (declare (type octet octet)) 316 (block utf-8-sequence 317 (multiple-value-bind (start count) 318 (cond ((not (logbitp 7 octet)) 319 ;; avoid the overlong checks below 320 (return-from utf-8-sequence octet)) 321 ((= #b11000000 (logand* octet #b11100000)) 322 (values (logand* octet #b00011111) 1)) 323 ((= #b11100000 (logand* octet #b11110000)) 324 (values (logand* octet #b00001111) 2)) 325 ((= #b11110000 (logand* octet #b11111000)) 326 (values (logand* octet #b00000111) 3)) 327 (t (return-from char-decoder 328 (recover-from-encoding-error format 329 "Unexpected value #x~X at start of UTF-8 sequence." 330 octet)))) 331 (declare (fixnum count)) 332 (loop for result of-type code-point 333 = start then (+ (ash* result 6) 334 (logand* octet #b111111)) 335 repeat count 336 for octet of-type octet = (read-next-byte) 337 unless (= #b10000000 (logand* octet #b11000000)) 338 do (return-from char-decoder 339 (recover-from-encoding-error format 340 "Unexpected value #x~X in UTF-8 sequence." octet)) 341 finally (return (cond ((< result (ecase count 342 (1 #x00080) 343 (2 #x00800) 344 (3 #x10000))) 345 (recover-from-overlong-sequence result)) 346 (t result))))))))))) 347 348 (define-char-decoders (flexi-utf-16-le-format flexi-cr-utf-16-le-format flexi-crlf-utf-16-le-format) 349 (let (first-octet-seen) 350 (declare (boolean first-octet-seen)) 351 (macrolet ((read-next-byte () 352 '(prog1 353 (or octet-getter 354 (cond (first-octet-seen 355 (return-from char-decoder 356 (recover-from-encoding-error format 357 "End of data while in UTF-16 sequence."))) 358 (t (return-from char-decoder nil)))) 359 (setq first-octet-seen t)))) 360 (flet ((read-next-word () 361 (+ (the octet (read-next-byte)) 362 (ash* (the octet (read-next-byte)) 8)))) 363 (declare (inline read-next-word)) 364 (let ((word (read-next-word))) 365 (declare (type (unsigned-byte 16) word)) 366 (cond ((<= #xd800 word #xdfff) 367 (let ((next-word (read-next-word))) 368 (declare (type (unsigned-byte 16) next-word)) 369 (unless (<= #xdc00 next-word #xdfff) 370 (return-from char-decoder 371 (recover-from-encoding-error format 372 "Unexpected UTF-16 word #x~X following #x~X." 373 next-word word))) 374 (+ (ash* (logand* #b1111111111 word) 10) 375 (logand* #b1111111111 next-word) 376 #x10000))) 377 (t word))))))) 378 379 (define-char-decoders (flexi-utf-16-be-format flexi-cr-utf-16-be-format flexi-crlf-utf-16-be-format) 380 (let (first-octet-seen) 381 (declare (boolean first-octet-seen)) 382 (macrolet ((read-next-byte () 383 '(prog1 384 (or octet-getter 385 (cond (first-octet-seen 386 (return-from char-decoder 387 (recover-from-encoding-error format 388 "End of data while in UTF-16 sequence."))) 389 (t (return-from char-decoder nil)))) 390 (setq first-octet-seen t)))) 391 (flet ((read-next-word () 392 (+ (ash* (the octet (read-next-byte)) 8) 393 (the octet (read-next-byte))))) 394 (declare (inline read-next-word)) 395 (let ((word (read-next-word))) 396 (declare (type (unsigned-byte 16) word)) 397 (cond ((<= #xd800 word #xdfff) 398 (let ((next-word (read-next-word))) 399 (declare (type (unsigned-byte 16) next-word)) 400 (unless (<= #xdc00 next-word #xdfff) 401 (return-from char-decoder 402 (recover-from-encoding-error format 403 "Unexpected UTF-16 word #x~X following #x~X." 404 next-word word))) 405 (+ (ash* (logand* #b1111111111 word) 10) 406 (logand* #b1111111111 next-word) 407 #x10000))) 408 (t word))))))) 409 410 (define-char-decoders (flexi-utf-32-le-format flexi-cr-utf-32-le-format flexi-crlf-utf-32-le-format) 411 (let (first-octet-seen) 412 (declare (boolean first-octet-seen)) 413 (macrolet ((read-next-byte () 414 '(prog1 415 (or octet-getter 416 (cond (first-octet-seen 417 (return-from char-decoder 418 (recover-from-encoding-error format 419 "End of data while in UTF-32 sequence."))) 420 (t (return-from char-decoder nil)))) 421 (setq first-octet-seen t)))) 422 (loop for count of-type fixnum from 0 to 24 by 8 423 for octet of-type octet = (read-next-byte) 424 sum (ash* octet count))))) 425 426 (define-char-decoders (flexi-utf-32-be-format flexi-cr-utf-32-be-format flexi-crlf-utf-32-be-format) 427 (let (first-octet-seen) 428 (declare (boolean first-octet-seen)) 429 (macrolet ((read-next-byte () 430 '(prog1 431 (or octet-getter 432 (cond (first-octet-seen 433 (return-from char-decoder 434 (recover-from-encoding-error format 435 "End of data while in UTF-32 sequence."))) 436 (t (return-from char-decoder nil)))) 437 (setq first-octet-seen t)))) 438 (loop for count of-type fixnum from 24 downto 0 by 8 439 for octet of-type octet = (read-next-byte) 440 sum (ash* octet count))))) 441 442 (defmethod octets-to-char-code ((format flexi-cr-mixin) reader) 443 (declare #.*fixnum-optimize-settings*) 444 (declare (ignore reader)) 445 (let ((char-code (call-next-method))) 446 (case char-code 447 (#.+cr+ #.(char-code #\Newline)) 448 (otherwise char-code)))) 449 450 (defmethod octets-to-char-code ((format flexi-crlf-mixin) reader) 451 (declare #.*fixnum-optimize-settings*) 452 (declare (function *current-unreader*)) 453 (declare (ignore reader)) 454 (let ((char-code (call-next-method))) 455 (case char-code 456 (#.+cr+ 457 (let ((next-char-code (call-next-method))) 458 (case next-char-code 459 (#.+lf+ #.(char-code #\Newline)) 460 ;; we saw a CR but no LF afterwards, but then the data 461 ;; ended, so we just return #\Return 462 ((nil) +cr+) 463 ;; if the character we peeked at wasn't a 464 ;; linefeed character we unread its constituents 465 (otherwise (funcall *current-unreader* (code-char next-char-code)) 466 char-code)))) 467 (otherwise char-code)))) 468