strings.lisp - clic - Clic is an command line interactive client for gopher written in Common LISP
 (HTM) git clone git://bitreich.org/clic/ git://enlrupgkhuxnvlhsf6lc3fziv5h2hhfrinws65d7roiv6bfj7d652fid.onion/clic/
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) Tags
 (DIR) README
 (DIR) LICENSE
       ---
       strings.lisp (15562B)
       ---
            1 ;;;; -*- Mode: lisp; indent-tabs-mode: nil -*-
            2 ;;;
            3 ;;; strings.lisp --- Conversions between strings and UB8 vectors.
            4 ;;;
            5 ;;; Copyright (C) 2007, Luis Oliveira  <loliveira@common-lisp.net>
            6 ;;;
            7 ;;; Permission is hereby granted, free of charge, to any person
            8 ;;; obtaining a copy of this software and associated documentation
            9 ;;; files (the "Software"), to deal in the Software without
           10 ;;; restriction, including without limitation the rights to use, copy,
           11 ;;; modify, merge, publish, distribute, sublicense, and/or sell copies
           12 ;;; of the Software, and to permit persons to whom the Software is
           13 ;;; furnished to do so, subject to the following conditions:
           14 ;;;
           15 ;;; The above copyright notice and this permission notice shall be
           16 ;;; included in all copies or substantial portions of the Software.
           17 ;;;
           18 ;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
           19 ;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
           20 ;;; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
           21 ;;; NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
           22 ;;; HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
           23 ;;; WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
           24 ;;; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
           25 ;;; DEALINGS IN THE SOFTWARE.
           26 
           27 (in-package #:babel)
           28 
           29 ;;; The usefulness of this string/octets interface of Babel's is very
           30 ;;; limited on Lisps with 8-bit characters which will in effect only
           31 ;;; support the latin-1 subset of Unicode.  That is, all encodings are
           32 ;;; supported but we can only store the first 256 code points in Lisp
           33 ;;; strings.  Support for using other 8-bit encodings for strings on
           34 ;;; these Lisps could be added with an extra encoding/decoding step.
           35 ;;; Supporting other encodings with larger code units would be silly
           36 ;;; (it would break expectations about common string operations) and
           37 ;;; better done with something like Closure's runes.
           38 
           39 ;;; Can we handle unicode fully?
           40 (eval-when (:compile-toplevel :load-toplevel :execute)
           41   ;; The EVAL is just here to avoid warnings...
           42   (case (eval char-code-limit)
           43     (#x100 (pushnew '8-bit-chars *features*))
           44     (#x10000 (pushnew 'ucs-2-chars *features*))
           45     (#x110000 #| yay |#)
           46     ;; This is here mostly because if the CHAR-CODE-LIMIT is bigger
           47     ;; than #x11000, strange things might happen but we probably
           48     ;; shouldn't descriminate against other, smaller, values.
           49     (t (error "Strange CHAR-CODE-LIMIT (#x~X), bailing out."
           50               char-code-limit))))
           51 
           52 ;;; Adapted from Ironclad.  TODO: check if it's worthwhile adding
           53 ;;; implementation-specific accessors such as SAP-REF-* for SBCL.
           54 (defmacro ub-get (vector index &optional (bytes 1) (endianness :ne))
           55   (let ((big-endian (member endianness
           56                             '(:be #+big-endian :ne #+little-endian :re))))
           57     (once-only (vector index)
           58       `(logand
           59         ,(1- (ash 1 (* 8 bytes)))
           60         (logior
           61          ,@(loop for i from 0 below bytes
           62                  for offset = (if big-endian i (- bytes i 1))
           63                  for shift = (if big-endian
           64                                  (* (- bytes i 1) 8)
           65                                  (* offset 8))
           66                  collect `(ash (aref ,vector (+ ,index ,offset)) ,shift)))))))
           67 
           68 (defmacro ub-set (value vector index &optional (bytes 1) (endianness :ne))
           69   (let ((big-endian (member endianness
           70                             '(:be #+big-endian :ne #+little-endian :re))))
           71     `(progn
           72        ,@(loop for i from 1 to bytes
           73                for offset = (if big-endian (- bytes i) (1- i)) collect
           74                `(setf (aref ,vector (+ ,index ,offset))
           75                       (ldb (byte 8 ,(* 8 (1- i))) ,value)))
           76        (values))))
           77 
           78 (defmacro string-get (string index)
           79   `(char-code (schar ,string ,index)))
           80 
           81 (defmacro string-set (code string index)
           82   `(setf (schar ,string ,index) (code-char ,code)))
           83 
           84 ;;; SIMPLE-BASE-STRING would also be a subtype of SIMPLE-STRING so we
           85 ;;; don't use that because on SBCL BASE-CHARs can only hold ASCII.
           86 ;;; Also, with (> SPEED SAFETY) (setf (schar base-str n) big-char)
           87 ;;; will quietly work, sort of.
           88 ;;;
           89 ;;; XXX: test this on various lisps.
           90 
           91 (defconstant unicode-char-code-limit
           92   char-code-limit
           93   "An alias for CL:CHAR-CODE-LIMIT which might be lower than
           94 #x110000 on some Lisps.")
           95 
           96 (deftype unicode-char ()
           97   "This character type can hold any characters whose CHAR-CODEs
           98 are less than UNICODE-CHAR-CODE-LIMIT."
           99   #+lispworks 'lw:simple-char
          100   #-lispworks 'character)
          101 
          102 (deftype simple-unicode-string ()
          103   "Alias for (SIMPLE-ARRAY UNICODE-CHAR (*))."
          104   '(simple-array unicode-char (*)))
          105 
          106 (deftype unicode-string ()
          107   "Alias for (VECTOR UNICODE-CHAR *)."
          108   '(vector unicode-char *))
          109 
          110 (defparameter *string-vector-mappings*
          111   (instantiate-concrete-mappings
          112    ;; :optimize ((speed 3) (safety 0) (debug 0) (compilation-speed 0))
          113    :octet-seq-setter ub-set
          114    :octet-seq-getter ub-get
          115    :octet-seq-type (simple-array (unsigned-byte 8) (*))
          116    :code-point-seq-setter string-set
          117    :code-point-seq-getter string-get
          118    :code-point-seq-type simple-unicode-string))
          119 
          120 #+sbcl
          121 (defparameter *simple-base-string-vector-mappings*
          122   (instantiate-concrete-mappings
          123    ;; :optimize ((speed 3) (safety 0) (debug 0) (compilation-speed 0))
          124    :instantiate-decoders nil
          125    :octet-seq-setter ub-set
          126    :octet-seq-getter ub-get
          127    :octet-seq-type (simple-array (unsigned-byte 8) (*))
          128    :code-point-seq-setter string-set
          129    :code-point-seq-getter string-get
          130    :code-point-seq-type simple-base-string))
          131 
          132 ;;; Do we want a more a specific error condition here?
          133 (defun check-vector-bounds (vector start end)
          134   (unless (<= 0 start end (length vector))
          135     (error "Invalid start (~A) and end (~A) values for vector of length ~A."
          136            start end (length vector))))
          137 
          138 (defmacro with-simple-vector (((v vector) (s start) (e end)) &body body)
          139   "If VECTOR is a displaced or adjustable array, binds V to the
          140 underlying simple vector, adds an adequate offset to START and
          141 END and binds those offset values to S and E.  Otherwise, if
          142 VECTOR is already a simple array, it's simply bound to V with no
          143 further changes.
          144 
          145 START and END are unchecked and assumed to be within bounds.
          146 
          147 Note that in some Lisps, a slow copying implementation is
          148 necessary to obtain a simple vector thus V will be bound to a
          149 copy of VECTOR coerced to a simple-vector.  Therefore, you
          150 shouldn't attempt to modify V."
          151   #+sbcl
          152   `(sb-kernel:with-array-data ((,v ,vector) (,s ,start) (,e ,end))
          153      ,@body)
          154   #+(or cmu scl)
          155   `(lisp::with-array-data ((,v ,vector) (,s ,start) (,e ,end))
          156      ,@body)
          157   #+openmcl
          158   (with-unique-names (offset)
          159     `(multiple-value-bind (,v ,offset)
          160          (ccl::array-data-and-offset ,vector)
          161        (let ((,s (+ ,start ,offset))
          162              (,e (+ ,end ,offset)))
          163          ,@body)))
          164   #+allegro
          165   (with-unique-names (offset)
          166     `(excl::with-underlying-simple-vector (,vector ,v ,offset)
          167        (let ((,e (+ ,end ,offset))
          168              (,s (+ ,start ,offset)))
          169          ,@body)))
          170   ;; slow, copying implementation
          171   #-(or sbcl cmu scl openmcl allegro)
          172   (once-only (vector)
          173     `(funcall (if (adjustable-array-p ,vector)
          174                   #'call-with-array-data/copy
          175                   #'call-with-array-data/fast)
          176               ,vector ,start ,end
          177               (lambda (,v ,s ,e) ,@body))))
          178 
          179 #-(or sbcl cmu scl openmcl allegro)
          180 (progn
          181   ;; Stolen from f2cl.
          182   (defun array-data-and-offset (array)
          183     (loop with offset = 0 do
          184           (multiple-value-bind (displaced-to index-offset)
          185               (array-displacement array)
          186             (when (null displaced-to)
          187               (return-from array-data-and-offset
          188                 (values array offset)))
          189             (incf offset index-offset)
          190             (setf array displaced-to))))
          191 
          192   (defun call-with-array-data/fast (vector start end fn)
          193     (multiple-value-bind (data offset)
          194         (array-data-and-offset vector)
          195       (funcall fn data (+ offset start) (+ offset end))))
          196 
          197   (defun call-with-array-data/copy (vector start end fn)
          198     (funcall fn (replace (make-array (- end start) :element-type
          199                                      (array-element-type vector))
          200                          vector :start2 start :end2 end)
          201              0 (- end start))))
          202 
          203 (defmacro with-checked-simple-vector (((v vector) (s start) (e end)) &body body)
          204   "Like WITH-SIMPLE-VECTOR but bound-checks START and END."
          205   (once-only (vector start)
          206     `(let ((,e (or ,end (length ,vector))))
          207        (check-vector-bounds ,vector ,start ,e)
          208        (with-simple-vector ((,v ,vector) (,s ,start) (,e ,e))
          209          ,@body))))
          210 
          211 ;;; Future features these functions should have:
          212 ;;;
          213 ;;;   * null-terminate
          214 ;;;   * specify target vector/string + offset
          215 ;;;   * documentation :)
          216 
          217 (declaim (inline octets-to-string string-to-octets string-size-in-octets
          218                  vector-size-in-chars concatenate-strings-to-octets
          219                  bom-vector))
          220 
          221 (defun octets-to-string (vector &key (start 0) end
          222                          (errorp (not *suppress-character-coding-errors*))
          223                          (encoding *default-character-encoding*))
          224   (check-type vector (vector (unsigned-byte 8)))
          225   (with-checked-simple-vector ((vector vector) (start start) (end end))
          226     (declare (type (simple-array (unsigned-byte 8) (*)) vector))
          227     (let ((*suppress-character-coding-errors* (not errorp))
          228           (mapping (lookup-mapping *string-vector-mappings* encoding)))
          229       (multiple-value-bind (size new-end)
          230           (funcall (code-point-counter mapping) vector start end -1)
          231         ;; TODO we could optimize ASCII here: the result should
          232         ;; be a simple-base-string filled using code-char...
          233         (let ((string (make-string size :element-type 'unicode-char)))
          234           (funcall (decoder mapping) vector start new-end string 0)
          235           string)))))
          236 
          237 (defun bom-vector (encoding use-bom)
          238   (check-type use-bom (member :default t nil))
          239   (the simple-vector
          240     (if (null use-bom)
          241         #()
          242         (let ((enc (typecase encoding
          243                      (external-format (external-format-encoding encoding))
          244                      (t (get-character-encoding encoding)))))
          245           (if (or (eq use-bom t)
          246                   (and (eq use-bom :default) (enc-use-bom enc)))
          247               ;; VALUES avoids a "type assertion too complex to check" note.
          248               (values (enc-bom-encoding enc))
          249               #())))))
          250 
          251 (defun string-to-octets (string &key (encoding *default-character-encoding*)
          252                          (start 0) end (use-bom :default)
          253                          (errorp (not *suppress-character-coding-errors*)))
          254   (declare (optimize (speed 3) (safety 2)))
          255   (let ((*suppress-character-coding-errors* (not errorp)))
          256     (etypecase string
          257       ;; On some lisps (e.g. clisp and ccl) all strings are BASE-STRING and all
          258       ;; characters are BASE-CHAR. So, only enable this optimization for
          259       ;; selected targets.
          260       #+sbcl
          261       (simple-base-string
          262        (unless end
          263          (setf end (length string)))
          264        (check-vector-bounds string start end)
          265        (let* ((mapping (lookup-mapping *simple-base-string-vector-mappings*
          266                                        encoding))
          267               (bom (bom-vector encoding use-bom))
          268               (bom-length (length bom))
          269               ;; OPTIMIZE: we could use the (length string) information here
          270               ;; because it's a simple-base-string where each character <= 127
          271               (result (make-array
          272                        (+ (the array-index
          273                             (funcall (the function (octet-counter mapping))
          274                                      string start end -1))
          275                           bom-length)
          276                        :element-type '(unsigned-byte 8))))
          277          (replace result bom)
          278          (funcall (the function (encoder mapping))
          279                   string start end result bom-length)
          280          result))
          281       (string
          282        ;; FIXME: we shouldn't really need that coercion to UNICODE-STRING
          283        ;; but we kind of because it's declared all over.  To avoid that,
          284        ;; we'd need different types for input and output strings.  Or maybe
          285        ;; this is not a problem; figure that out.
          286        (with-checked-simple-vector ((string (coerce string 'unicode-string))
          287                                     (start start) (end end))
          288          (declare (type simple-unicode-string string))
          289          (let* ((mapping (lookup-mapping *string-vector-mappings* encoding))
          290                 (bom (bom-vector encoding use-bom))
          291                 (bom-length (length bom))
          292                 (result (make-array
          293                          (+ (the array-index
          294                               (funcall (the function (octet-counter mapping))
          295                                        string start end -1))
          296                             bom-length)
          297                          :element-type '(unsigned-byte 8))))
          298            (replace result bom)
          299            (funcall (the function (encoder mapping))
          300                     string start end result bom-length)
          301            result))))))
          302 
          303 (defun concatenate-strings-to-octets (encoding &rest strings)
          304   "Optimized equivalent of
          305 \(string-to-octets \(apply #'concatenate 'string strings)
          306                   :encoding encoding)"
          307   (declare (dynamic-extent strings))
          308   (let* ((mapping (lookup-mapping *string-vector-mappings* encoding))
          309          (octet-counter (octet-counter mapping))
          310          (vector (make-array
          311                   (the array-index
          312                     (reduce #'+ strings
          313                             :key (lambda (string)
          314                                    (funcall octet-counter
          315                                             string 0 (length string) -1))))
          316                   :element-type '(unsigned-byte 8)))
          317          (current-index 0))
          318     (declare (type array-index current-index))
          319     (dolist (string strings)
          320       (check-type string string)
          321       (with-checked-simple-vector ((string (coerce string 'unicode-string))
          322                                    (start 0) (end (length string)))
          323         (declare (type simple-unicode-string string))
          324         (incf current-index
          325               (funcall (encoder mapping)
          326                        string start end vector current-index))))
          327     vector))
          328 
          329 (defun string-size-in-octets (string &key (start 0) end (max -1 maxp)
          330                               (errorp (not *suppress-character-coding-errors*))
          331                               (encoding *default-character-encoding*))
          332   (check-type string string)
          333   (with-checked-simple-vector ((string (coerce string 'unicode-string))
          334                                (start start) (end end))
          335     (declare (type simple-unicode-string string))
          336     (let ((mapping (lookup-mapping *string-vector-mappings* encoding))
          337           (*suppress-character-coding-errors* (not errorp)))
          338       (when maxp (assert (plusp max)))
          339       (funcall (octet-counter mapping) string start end max))))
          340 
          341 (defun vector-size-in-chars (vector &key (start 0) end (max -1 maxp)
          342                              (errorp (not *suppress-character-coding-errors*))
          343                              (encoding *default-character-encoding*))
          344   (check-type vector (vector (unsigned-byte 8)))
          345   (with-checked-simple-vector ((vector vector) (start start) (end end))
          346     (declare (type (simple-array (unsigned-byte 8) (*)) vector))
          347     (let ((mapping (lookup-mapping *string-vector-mappings* encoding))
          348           (*suppress-character-coding-errors* (not errorp)))
          349       (when maxp (assert (plusp max)))
          350       (funcall (code-point-counter mapping) vector start end max))))
          351 
          352 (declaim (notinline octets-to-string string-to-octets string-size-in-octets
          353                     vector-size-in-chars concatenate-strings-to-octets))