bitreich.org

       decode.lisp - clic - Clic is an command line interactive client for gopher written in Common LISP
 (HTM) git clone git://bitreich.org/clic/ git://enlrupgkhuxnvlhsf6lc3fziv5h2hhfrinws65d7roiv6bfj7d652fid.onion/clic/
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) Tags
 (DIR) README
 (DIR) LICENSE
       ---
       decode.lisp (25456B)
       ---
            1 ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: FLEXI-STREAMS; Base: 10 -*-
            2 ;;; $Header: /usr/local/cvsrep/flexi-streams/decode.lisp,v 1.35 2008/08/26 10:59:22 edi Exp $
            3 
            4 ;;; Copyright (c) 2005-2008, Dr. Edmund Weitz.  All rights reserved.
            5 
            6 ;;; Redistribution and use in source and binary forms, with or without
            7 ;;; modification, are permitted provided that the following conditions
            8 ;;; are met:
            9 
           10 ;;;   * Redistributions of source code must retain the above copyright
           11 ;;;     notice, this list of conditions and the following disclaimer.
           12 
           13 ;;;   * Redistributions in binary form must reproduce the above
           14 ;;;     copyright notice, this list of conditions and the following
           15 ;;;     disclaimer in the documentation and/or other materials
           16 ;;;     provided with the distribution.
           17 
           18 ;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
           19 ;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
           20 ;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
           21 ;;; ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
           22 ;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
           23 ;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
           24 ;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
           25 ;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
           26 ;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
           27 ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
           28 ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
           29 
           30 (in-package :flexi-streams)
           31 
           32 (defun recover-from-encoding-error (external-format format-control &rest format-args)
           33   "Helper function used by OCTETS-TO-CHAR-CODE below to deal with
           34 encoding errors.  Checks if *SUBSTITUTION-CHAR* is not NIL and returns
           35 its character code in this case.  Otherwise signals an
           36 EXTERNAL-FORMAT-ENCODING-ERROR as determined by the arguments to this
           37 function and provides a corresponding USE-VALUE restart."
           38   (when *substitution-char*
           39     (return-from recover-from-encoding-error (char-code *substitution-char*)))
           40   (restart-case
           41       (apply #'signal-encoding-error external-format format-control format-args)
           42     (use-value (char)
           43       :report "Specify a character to be used instead."
           44       :interactive (lambda ()
           45                      (loop
           46                       (format *query-io* "Type a character: ")
           47                       (let ((line (read-line *query-io*)))
           48                         (when (= 1 (length line))
           49                           (return (list (char line 0)))))))
           50       (char-code char))))
           51 
           52 (defgeneric octets-to-char-code (format reader)
           53   (declare #.*standard-optimize-settings*)
           54   (:documentation "Converts a sequence of octets to a character code
           55 \(which is returned, or NIL in case of EOF) using the external format
           56 FORMAT.  The sequence is obtained by calling the function \(which must
           57 be a functional object) READER with no arguments which should return
           58 one octet per call.  In the case of EOF, READER should return NIL.
           59 
           60 The special variable *CURRENT-UNREADER* must be bound correctly
           61 whenever this function is called."))
           62 
           63 (defgeneric octets-to-string* (format sequence start end)
           64   (declare #.*standard-optimize-settings*)
           65   (:documentation "A generic function which dispatches on the external
           66 format and does the real work for OCTETS-TO-STRING."))
           67 
           68 (defmethod octets-to-string* :around (format (list list) start end)
           69   (declare #.*standard-optimize-settings*)
           70   (octets-to-string* format (coerce list 'vector) start end))
           71 
           72 (defmacro define-sequence-readers ((format-class) &body body)
           73   "Non-hygienic utility macro which defines methods for READ-SEQUENCE*
           74 and OCTETS-TO-STRING* for the class FORMAT-CLASS.  BODY is described
           75 in the docstring of DEFINE-CHAR-ENCODERS but can additionally contain
           76 a form \(UNGET <form>) which has to be replaced by the correct code to
           77 `unread' the octets for the character designated by <form>."
           78   (let* ((body `((block char-decoder
           79                    (locally
           80                      (declare #.*fixnum-optimize-settings*)
           81                      ,@body)))))
           82     `(progn
           83        (defmethod read-sequence* ((format ,format-class) flexi-input-stream sequence start end)
           84          (with-accessors ((position flexi-stream-position)
           85                           (bound flexi-stream-bound)
           86                           (octet-stack flexi-stream-octet-stack)
           87                           (last-octet flexi-stream-last-octet)
           88                           (last-char-code flexi-stream-last-char-code)
           89                           (stream flexi-stream-stream))
           90              flexi-input-stream
           91            (let* (buffer
           92                   (buffer-pos 0)
           93                   (buffer-end 0)
           94                   (index start)
           95                   donep
           96                   ;; whether we will later be able to rewind the stream if
           97                   ;; needed (to get rid of unused octets in the buffer)
           98                   (can-rewind-p (maybe-rewind stream 0))
           99                   (factor (encoding-factor format))
          100                   (integer-factor (floor factor))
          101                   ;; it's an interesting question whether it makes sense
          102                   ;; performance-wise to make RESERVE significantly bigger
          103                   ;; (and thus put potentially a lot more octets into
          104                   ;; OCTET-STACK), especially for UTF-8
          105                   (reserve (cond ((or (not (floatp factor))
          106                                       (not can-rewind-p)) 0)
          107                                  (t (ceiling (* (- factor integer-factor) (- end start)))))))
          108              (declare (fixnum buffer-pos buffer-end index integer-factor reserve)
          109                       (boolean can-rewind-p))
          110              (flet ((compute-fill-amount ()
          111                       "Computes the amount of octets we can savely read into
          112 the buffer without violating the stream's bound \(if there is one) and
          113 without potentially reading much more than we need \(unless we can
          114 rewind afterwards)."
          115                       (let ((minimum (min (the fixnum (+ (the fixnum (* integer-factor
          116                                                                         (the fixnum (- end index))))
          117                                                          reserve))
          118                                           +buffer-size+)))
          119                         (cond (bound (min minimum (- bound position)))
          120                               (t minimum))))
          121                     (fill-buffer (end)
          122                       "Tries to fill the buffer from BUFFER-POS to END and
          123 returns NIL if the buffer doesn't contain any new data."
          124                       (when donep
          125                         (return-from fill-buffer nil))
          126                       ;; put data from octet stack into buffer if there is any
          127                       (loop
          128                        (when (>= buffer-pos end)
          129                          (return))
          130                        (let ((next-octet (pop octet-stack)))
          131                          (cond (next-octet
          132                                 (setf (aref (the (array octet *) buffer) buffer-pos) (the octet next-octet))
          133                                 (incf buffer-pos))
          134                                (t (return)))))
          135                       (setq buffer-end (read-sequence buffer stream
          136                                                       :start buffer-pos
          137                                                       :end end))
          138                       ;; we reached EOF, so we remember this
          139                       (when (< buffer-end end)
          140                         (setq donep t))
          141                       ;; BUFFER-POS is only greater than zero if the buffer
          142                       ;; already contains unread data from the octet stack
          143                       ;; (see below), so we test for ZEROP here and do /not/
          144                       ;; compare with BUFFER-POS
          145                       (unless (zerop buffer-end)
          146                         (incf position buffer-end))))
          147                (let ((minimum (compute-fill-amount)))
          148                  (declare (fixnum minimum))
          149                  (setq buffer (make-octet-buffer minimum))
          150                  ;; fill buffer for the first time or return immediately if
          151                  ;; we don't succeed
          152                  (unless (fill-buffer minimum)
          153                    (return-from read-sequence* start)))
          154                (setq buffer-pos 0)
          155                (macrolet ((iterate (set-place)
          156                             "A very unhygienic macro to implement the
          157 actual iteration through the sequence including housekeeping for the
          158 flexi stream.  SET-PLACE is the place \(using the index INDEX) used to
          159 access the sequence."
          160                             `(flet ((leave ()
          161                                       "This is the function used to
          162 abort the LOOP iteration below."
          163                                       (when (> index start)
          164                                         (setq last-octet nil
          165                                               last-char-code ,(sublis '((index . (1- index))) set-place)))
          166                                       (return-from read-sequence* index)))
          167                                (loop
          168                                 (when (>= index end)
          169                                   ;; check if there are octets in the
          170                                   ;; buffer we didn't use - see
          171                                   ;; COMPUTE-FILL-AMOUNT above
          172                                   (let ((rest (- buffer-end buffer-pos)))
          173                                     (when (plusp rest)
          174                                       (or (and can-rewind-p
          175                                                (maybe-rewind stream rest))
          176                                           (loop
          177                                            (when (>= buffer-pos buffer-end)
          178                                              (return))
          179                                            (decf buffer-end)
          180                                            (push (aref (the (array octet *) buffer) buffer-end)
          181                                                  octet-stack)))))
          182                                   (leave))
          183                                 (let ((next-char-code
          184                                        (progn (symbol-macrolet
          185                                                   ((octet-getter
          186                                                     ;; this is the code to retrieve the next octet (or
          187                                                     ;; NIL) and to fill the buffer if needed
          188                                                     (block next-octet
          189                                                       (when (>= buffer-pos buffer-end)
          190                                                         (setq buffer-pos 0)
          191                                                         (unless (fill-buffer (compute-fill-amount))
          192                                                           (return-from next-octet)))
          193                                                       (prog1
          194                                                           (aref (the (array octet *) buffer) buffer-pos)
          195                                                         (incf buffer-pos)))))
          196                                                 (macrolet ((unget (form)
          197                                                              `(unread-char% ,form flexi-input-stream)))
          198                                                   ,',@body)))))
          199                                   (unless next-char-code
          200                                     (leave))
          201                                   (setf ,set-place (code-char next-char-code))
          202                                   (incf index))))))
          203                  (etypecase sequence
          204                    (string (iterate (char sequence index)))
          205                    (array (iterate (aref sequence index)))
          206                    (list (iterate (nth index sequence)))))))))
          207        (defmethod octets-to-string* ((format ,format-class) sequence start end)
          208          (declare #.*standard-optimize-settings*)
          209          (declare (fixnum start end))
          210          (let* ((i start)
          211                 (string-length (compute-number-of-chars format sequence start end))
          212                 (string (make-array string-length :element-type 'char*)))
          213            (declare (fixnum i string-length))
          214            (loop for j of-type fixnum from 0 below string-length
          215                  do (setf (schar string j)
          216                           (code-char (macrolet ((unget (form)
          217                                                   `(decf i (character-length format ,form))))
          218                                        (symbol-macrolet ((octet-getter (and (< i end)
          219                                                                             (prog1
          220                                                                                 (the octet (aref sequence i))
          221                                                                               (incf i)))))
          222                                          ,@body))))
          223                  finally (return string)))))))
          224 
          225 (defmacro define-char-decoders ((lf-format-class cr-format-class crlf-format-class) &body body)
          226   "Non-hygienic utility macro which defines several decoding-related
          227 methods for the classes LF-FORMAT-CLASS, CR-FORMAT-CLASS, and
          228 CRLF-FORMAT-CLASS where it is assumed that CR-FORMAT-CLASS is the same
          229 encoding as LF-FORMAT-CLASS but with CR instead of LF line endings and
          230 similar for CRLF-FORMAT-CLASS, i.e. LF-FORMAT-CLASS is the base class.
          231 BODY is a code template for the code to read octets and return one
          232 character code.  BODY must contain a symbol OCTET-GETTER representing
          233 the form which is used to obtain the next octet."
          234   (let* ((body (with-unique-names (char-code)
          235                  `((let ((,char-code (progn ,@body)))
          236                      (when (and ,char-code
          237                                 (or (<= #xd8 (logand* #x00ff (ash* ,char-code -8)) #xdf)
          238                                     (> ,char-code #x10ffff)))
          239                        (recover-from-encoding-error format "Illegal code point ~A \(#x~:*~X)." ,char-code))
          240                      ,char-code)))))
          241     `(progn
          242        (defmethod octets-to-char-code ((format ,lf-format-class) reader)
          243          (declare #.*fixnum-optimize-settings*)
          244          (declare (function reader))
          245          (symbol-macrolet ((octet-getter (funcall reader)))
          246            ,@(sublis '((char-decoder . octets-to-char-code))
          247                      body)))
          248        (define-sequence-readers (,lf-format-class) ,@body)
          249        (define-sequence-readers (,cr-format-class)
          250          ,(with-unique-names (char-code)
          251             `(let ((,char-code (progn ,@body)))
          252                (case ,char-code
          253                  (#.+cr+ #.(char-code #\Newline))
          254                  (otherwise ,char-code)))))
          255        (define-sequence-readers  (,crlf-format-class)
          256          ,(with-unique-names (char-code next-char-code get-char-code)
          257             `(flet ((,get-char-code () ,@body))
          258                (let ((,char-code (,get-char-code)))
          259                  (case ,char-code
          260                    (#.+cr+
          261                     (let ((,next-char-code (,get-char-code)))
          262                       (case ,next-char-code
          263                         (#.+lf+ #.(char-code #\Newline))
          264                         ;; we saw a CR but no LF afterwards, but then the data
          265                         ;; ended, so we just return #\Return
          266                         ((nil) +cr+)
          267                         ;; if the character we peeked at wasn't a
          268                         ;; linefeed character we unread its constituents
          269                         (otherwise (unget (code-char ,next-char-code))
          270                                    ,char-code))))
          271                    (otherwise ,char-code)))))))))
          272 
          273 (define-char-decoders (flexi-latin-1-format flexi-cr-latin-1-format flexi-crlf-latin-1-format)
          274   octet-getter)
          275 
          276 (define-char-decoders (flexi-ascii-format flexi-cr-ascii-format flexi-crlf-ascii-format)
          277   (when-let (octet octet-getter)
          278     (if (> (the octet octet) 127)
          279       (recover-from-encoding-error format
          280                                    "No character which corresponds to octet #x~X." octet)
          281       octet)))
          282 
          283 (define-char-decoders (flexi-8-bit-format flexi-cr-8-bit-format flexi-crlf-8-bit-format)
          284   (with-accessors ((decoding-table external-format-decoding-table))
          285       format
          286     (when-let (octet octet-getter)
          287       (let ((char-code (aref (the (simple-array char-code-integer *) decoding-table)
          288                              (the octet octet))))
          289         (if (or (null char-code)
          290                 (= (the char-code-integer char-code) 65533))
          291           (recover-from-encoding-error format
          292                                        "No character which corresponds to octet #x~X." octet)
          293           char-code)))))
          294 
          295 (define-char-decoders (flexi-utf-8-format flexi-cr-utf-8-format flexi-crlf-utf-8-format)
          296   (let (first-octet-seen)
          297     (declare (boolean first-octet-seen))
          298     (macrolet ((read-next-byte ()
          299                  '(prog1
          300                       (or octet-getter
          301                           (cond (first-octet-seen
          302                                  (return-from char-decoder
          303                                    (recover-from-encoding-error format
          304                                                                 "End of data while in UTF-8 sequence.")))
          305                                 (t (return-from char-decoder nil))))
          306                     (setq first-octet-seen t))))
          307       (flet ((recover-from-overlong-sequence (value)
          308                (restart-case
          309                    (recover-from-encoding-error format "`Overlong' UTF-8 sequence for code point #x~X."
          310                                                 value)                 
          311                  (accept-overlong-sequence ()
          312                    :report "Accept the code point and continue."
          313                    value))))
          314         (let ((octet (read-next-byte)))
          315           (declare (type octet octet))
          316           (block utf-8-sequence
          317             (multiple-value-bind (start count)
          318                 (cond ((not (logbitp 7 octet))
          319                        ;; avoid the overlong checks below
          320                        (return-from utf-8-sequence octet))
          321                       ((= #b11000000 (logand* octet #b11100000))
          322                        (values (logand* octet #b00011111) 1))
          323                       ((= #b11100000 (logand* octet #b11110000))
          324                        (values (logand* octet #b00001111) 2))
          325                       ((= #b11110000 (logand* octet #b11111000))
          326                        (values (logand* octet #b00000111) 3))
          327                       (t (return-from char-decoder
          328                            (recover-from-encoding-error format
          329                                                         "Unexpected value #x~X at start of UTF-8 sequence."
          330                                                         octet))))
          331               (declare (fixnum count))
          332               (loop for result of-type code-point
          333                     = start then (+ (ash* result 6)
          334                                     (logand* octet #b111111))
          335                     repeat count
          336                     for octet of-type octet = (read-next-byte)
          337                     unless (= #b10000000 (logand* octet #b11000000))
          338                     do (return-from char-decoder
          339                          (recover-from-encoding-error format
          340                                                       "Unexpected value #x~X in UTF-8 sequence." octet))
          341                     finally (return (cond ((< result (ecase count
          342                                                        (1 #x00080)
          343                                                        (2 #x00800)
          344                                                        (3 #x10000)))
          345                                            (recover-from-overlong-sequence result))
          346                                           (t result)))))))))))
          347 
          348 (define-char-decoders (flexi-utf-16-le-format flexi-cr-utf-16-le-format flexi-crlf-utf-16-le-format)
          349   (let (first-octet-seen)
          350     (declare (boolean first-octet-seen))
          351     (macrolet ((read-next-byte ()
          352                  '(prog1
          353                       (or octet-getter
          354                           (cond (first-octet-seen
          355                                  (return-from char-decoder
          356                                    (recover-from-encoding-error format
          357                                                                 "End of data while in UTF-16 sequence.")))
          358                                 (t (return-from char-decoder nil))))
          359                     (setq first-octet-seen t))))
          360       (flet ((read-next-word ()
          361                (+ (the octet (read-next-byte))
          362                   (ash* (the octet (read-next-byte)) 8))))
          363         (declare (inline read-next-word))
          364         (let ((word (read-next-word)))
          365           (declare (type (unsigned-byte 16) word))
          366           (cond ((<= #xd800 word #xdfff)
          367                  (let ((next-word (read-next-word)))
          368                    (declare (type (unsigned-byte 16) next-word))
          369                    (unless (<= #xdc00 next-word #xdfff)
          370                      (return-from char-decoder
          371                        (recover-from-encoding-error format
          372                                                     "Unexpected UTF-16 word #x~X following #x~X."
          373                                                     next-word word)))
          374                    (+ (ash* (logand* #b1111111111 word) 10)
          375                       (logand* #b1111111111 next-word)
          376                       #x10000)))
          377                 (t word)))))))
          378 
          379 (define-char-decoders (flexi-utf-16-be-format flexi-cr-utf-16-be-format flexi-crlf-utf-16-be-format)
          380   (let (first-octet-seen)
          381     (declare (boolean first-octet-seen))
          382     (macrolet ((read-next-byte ()
          383                  '(prog1
          384                       (or octet-getter
          385                           (cond (first-octet-seen
          386                                  (return-from char-decoder
          387                                    (recover-from-encoding-error format
          388                                                                 "End of data while in UTF-16 sequence.")))
          389                                 (t (return-from char-decoder nil))))
          390                     (setq first-octet-seen t))))
          391       (flet ((read-next-word ()
          392                (+ (ash* (the octet (read-next-byte)) 8)
          393                   (the octet (read-next-byte)))))
          394         (declare (inline read-next-word))
          395         (let ((word (read-next-word)))
          396           (declare (type (unsigned-byte 16) word))
          397           (cond ((<= #xd800 word #xdfff)
          398                  (let ((next-word (read-next-word)))
          399                    (declare (type (unsigned-byte 16) next-word))
          400                    (unless (<= #xdc00 next-word #xdfff)
          401                      (return-from char-decoder
          402                        (recover-from-encoding-error format
          403                                                     "Unexpected UTF-16 word #x~X following #x~X."
          404                                                     next-word word)))
          405                    (+ (ash* (logand* #b1111111111 word) 10)
          406                       (logand* #b1111111111 next-word)
          407                       #x10000)))
          408                 (t word)))))))
          409 
          410 (define-char-decoders (flexi-utf-32-le-format flexi-cr-utf-32-le-format flexi-crlf-utf-32-le-format)
          411   (let (first-octet-seen)
          412     (declare (boolean first-octet-seen))
          413     (macrolet ((read-next-byte ()
          414                  '(prog1
          415                       (or octet-getter
          416                           (cond (first-octet-seen
          417                                  (return-from char-decoder
          418                                    (recover-from-encoding-error format
          419                                                                 "End of data while in UTF-32 sequence.")))
          420                                 (t (return-from char-decoder nil))))
          421                     (setq first-octet-seen t))))
          422       (loop for count of-type fixnum from 0 to 24 by 8
          423             for octet of-type octet = (read-next-byte)
          424             sum (ash* octet count)))))
          425 
          426 (define-char-decoders (flexi-utf-32-be-format flexi-cr-utf-32-be-format flexi-crlf-utf-32-be-format)
          427   (let (first-octet-seen)
          428     (declare (boolean first-octet-seen))
          429     (macrolet ((read-next-byte ()
          430                  '(prog1
          431                       (or octet-getter
          432                           (cond (first-octet-seen
          433                                  (return-from char-decoder
          434                                    (recover-from-encoding-error format
          435                                                                 "End of data while in UTF-32 sequence.")))
          436                                 (t (return-from char-decoder nil))))
          437                     (setq first-octet-seen t))))
          438       (loop for count of-type fixnum from 24 downto 0 by 8
          439             for octet of-type octet = (read-next-byte)
          440             sum (ash* octet count)))))
          441 
          442 (defmethod octets-to-char-code ((format flexi-cr-mixin) reader)
          443   (declare #.*fixnum-optimize-settings*)
          444   (declare (ignore reader))
          445   (let ((char-code (call-next-method)))
          446     (case char-code
          447       (#.+cr+ #.(char-code #\Newline))
          448       (otherwise char-code))))
          449 
          450 (defmethod octets-to-char-code ((format flexi-crlf-mixin) reader)
          451   (declare #.*fixnum-optimize-settings*)
          452   (declare (function *current-unreader*))
          453   (declare (ignore reader))
          454   (let ((char-code (call-next-method)))
          455     (case char-code
          456       (#.+cr+
          457        (let ((next-char-code (call-next-method)))
          458          (case next-char-code
          459            (#.+lf+ #.(char-code #\Newline))
          460            ;; we saw a CR but no LF afterwards, but then the data
          461            ;; ended, so we just return #\Return
          462            ((nil) +cr+)
          463            ;; if the character we peeked at wasn't a
          464            ;; linefeed character we unread its constituents
          465            (otherwise (funcall *current-unreader* (code-char next-char-code))
          466                       char-code))))
          467       (otherwise char-code))))
          468