enc-gbk.lisp - clic - Clic is an command line interactive client for gopher written in Common LISP
 (HTM) git clone git://bitreich.org/clic/ git://enlrupgkhuxnvlhsf6lc3fziv5h2hhfrinws65d7roiv6bfj7d652fid.onion/clic/
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) Tags
 (DIR) README
 (DIR) LICENSE
       ---
       enc-gbk.lisp (8297B)
       ---
            1 ;;;; -*- Mode: lisp; indent-tabs-mode: nil -*-
            2 ;;;
            3 ;;; enc-gbk.lisp --- GBK encodings.
            4 ;;;
            5 ;;; Copyright (C) 2011, Li Wenpeng  <levin108@gmail.com>
            6 ;;;
            7 ;;; Permission is hereby granted, free of charge, to any person
            8 ;;; obtaining a copy of this software and associated documentation
            9 ;;; files (the "Software"), to deal in the Software without
           10 ;;; restriction, including without limitation the rights to use, copy,
           11 ;;; modify, merge, publish, distribute, sublicense, and/or sell copies
           12 ;;; of the Software, and to permit persons to whom the Software is
           13 ;;; furnished to do so, subject to the following conditions:
           14 ;;;
           15 ;;; The above copyright notice and this permission notice shall be
           16 ;;; included in all copies or substantial portions of the Software.
           17 ;;;
           18 ;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
           19 ;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
           20 ;;; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
           21 ;;; NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
           22 ;;; HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
           23 ;;; WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
           24 ;;; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
           25 ;;; DEALINGS IN THE SOFTWARE.
           26 
           27 (in-package #:babel-encodings)
           28 
           29 ;; populated in gbk-map.lisp
           30 (defvar *gbk-unicode-mapping*)
           31 
           32 (defconstant +gbk2-offset+ 0)
           33 (defconstant +gbk3-offset+ 6763)
           34 (defconstant +gbk4-offset+ (+ 6763 6080))
           35 (defconstant +gbk1-offset+ 20902)
           36 (defconstant +gbk5-offset+ (+ 20902 846))
           37 
           38 (define-character-encoding :gbk
           39     "GBK is an extension of the GB2312 character set for simplified
           40 Chinese characters, used in the People's Republic of China."
           41   :max-units-per-char 4
           42   :literal-char-code-limit #x80)
           43 
           44 (define-condition invalid-gbk-byte (character-decoding-error)
           45   ()
           46   (:documentation "Signalled when an invalid GBK byte is found."))
           47 
           48 (define-condition invalid-gbk-character (character-encoding-error)
           49   ()
           50   (:documentation "Signalled when an invalid GBK character is found."))
           51 
           52 (define-octet-counter :gbk (getter type)
           53   `(lambda (seq start end max)
           54      (declare (type ,type seq) (fixnum start end max))
           55      (let ((noctets 0))
           56        (loop for i from start below end
           57              for u1 of-type code-point = (,getter seq i)
           58              do (cond ((< u1 #x80) (incf noctets))
           59                       (t (incf noctets 2)))
           60                 (when (and (plusp max) (= noctets max))
           61                   (return (values noctets i)))
           62              finally (return (values noctets i))))))
           63 
           64 (define-code-point-counter :gbk (getter type)
           65   `(lambda (seq start end max)
           66      (declare (type ,type seq))
           67      (let (u1 (noctets 0))
           68        (loop with i = start
           69              while (< i end)
           70              do (setf u1 (,getter seq i))
           71                 (cond
           72                   ((eq 0 (logand u1 #x80)) (incf i))
           73                   (t (incf i 2)))
           74                 (incf noctets)
           75                 (when (and (plusp max) (= noctets max))
           76                   (return (values noctets i)))
           77              finally (return (values noctets i))))))
           78 
           79 (define-encoder :gbk (getter src-type setter dest-type)
           80   `(lambda (src start end dest d-start)
           81      (declare (type ,src-type src)
           82               (type ,dest-type dest)
           83               (fixnum start end d-start))
           84      (macrolet
           85          ((do-encoding (index)
           86             `(let ((u1 0) (u2 0))
           87                (cond
           88                  ((<= +gbk2-offset+ ,index (- +gbk3-offset+ 1)) ; gbk/2
           89                   (setf u1 (+ #xB0 (truncate (/ ,index 94))))
           90                   (setf u2 (+ #xA1 (mod ,index 94))))
           91                  ((<= +gbk3-offset+ ,index (- +gbk4-offset+ 1)) ; gbk/3
           92                   (setf index (- ,index +gbk3-offset+))
           93                   (setf u1 (+ #x81 (truncate (/ ,index 190))))
           94                   (setf u2 (+ #x40 (mod ,index 190)))
           95                   (if (>= u2 #x7F) (incf u2)))
           96                  ((<= +gbk4-offset+ ,index (- +gbk1-offset+ 1)) ; gbk/4
           97                   (setf index (- ,index +gbk4-offset+))
           98                   (setf u1 (+ #xAA (truncate (/ ,index 96))))
           99                   (setf u2 (+ #x40 (mod ,index 96)))
          100                   (if (>= u2 #x7F) (incf u2)))
          101                  ((<= +gbk1-offset+ ,index (- +gbk5-offset+ 1)) ; gbk/1
          102                   (setf index (- ,index +gbk1-offset+))
          103                   (setf u1 (+ #xA1 (truncate (/ ,index 94))))
          104                   (setf u2 (+ #xA1 (mod ,index 94))))
          105                  ((<= +gbk5-offset+ ,index (length *gbk-unicode-mapping*)) ; gbk/5
          106                   (setf index (- ,index +gbk5-offset+))
          107                   (setf u1 (+ #xA8 (truncate (/ ,index 96))))
          108                   (setf u2 (+ #x40 (mod ,index 96)))
          109                   (if (>= u2 #x7F) (incf u2))))
          110                (values u1 u2))))
          111        (let ((c 0) index (noctets 0))
          112          (loop for i from start below end
          113                for code of-type code-point = (,getter src i)
          114                do (macrolet
          115                       ((handle-error (&optional (c 'character-encoding-error))
          116                          `(encoding-error code :gbk src i +repl+ ',c)))
          117                     (setf c (code-char code))
          118                     (cond
          119                       ((< code #x80)    ; ascii
          120                        (,setter code dest noctets)
          121                        (incf noctets))
          122                       (t                ; gbk
          123                        (setf index
          124                              (position c *gbk-unicode-mapping*))
          125 
          126                        (if (not index)
          127                            (handle-error invalid-gbk-character))
          128                        (multiple-value-bind (uh ul) (do-encoding index)
          129                          (,setter uh dest noctets)
          130                          (,setter ul dest (+ 1 noctets))
          131                          (incf noctets 2)))))
          132                finally (return (the fixnum (- noctets d-start))))))))
          133 
          134 (define-decoder :gbk (getter src-type setter dest-type)
          135   `(lambda (src start end dest d-start)
          136      (declare (type ,src-type src)
          137               (type ,dest-type dest))
          138      (let ((u1 0) (u2 0) (index 0) (tmp 0) (noctets 0))
          139        (loop with i = start
          140              while (< i end)
          141              do (macrolet
          142                     ((handle-error (&optional (c 'character-decoding-error))
          143                        `(decoding-error #(u1 u2) :gbk src i +repl+ ',c)))
          144                   (setf u1 (,getter src i))
          145                   (incf i)
          146                   (cond
          147                     ((eq 0 (logand u1 #x80))
          148                      (,setter u1 dest noctets))
          149                     (t
          150                      (setf u2 (,getter src i))
          151                      (incf i)
          152                      (setf index
          153                            (block setter-block
          154                              (cond
          155                                ((and (<= #xB0 u1 #xF7) (<= #xA1 u2 #xFE))
          156                                 (+ +gbk2-offset+ (+ (* 94 (- u1 #xB0)) (- u2 #xA1))))
          157 
          158                                ((and (<= #x81 u1 #xA0) (<= #x40 u2 #xFE))
          159                                 (cond ((> u2 #x7F) (setf tmp 1))
          160                                       (t (setf tmp 0)))
          161                                 (+ +gbk3-offset+ (* 190 (- u1 #x81)) (- u2 #x40 tmp)))
          162 
          163                                ((and (<= #xAA u1 #xFE) (<= #x40 #xA0))
          164                                 (cond ((> u2 #x7F) (setf tmp 1))
          165                                       (t (setf tmp 0)))
          166                                 (+ +gbk4-offset+ (* 96 (- u1 #xAA)) (- u2 #x40 tmp)))
          167 
          168                                ((and (<= #xA1 u1 #xA9) (<= #xA1 u2 #xFE))
          169                                 (+ +gbk1-offset+ (* 94 (- u1 #xA1)) (- u2 #xA1)))
          170 
          171                                ((and (<= #xA8 u1 #xA9) (<= #x40 #xA0))
          172                                 (cond ((> u2 #x7F) (setf tmp 1))
          173                                       (t (setf tmp 0)))
          174                                 (+ +gbk5-offset+ (* 96 (- u1 #xA8)) (- u2 #x40 tmp)))
          175                                (t
          176                                 (handle-error invalid-gbk-byte)))))
          177 
          178                      (when (>= index (length *gbk-unicode-mapping*))
          179                        (handle-error invalid-gbk-byte))
          180                      (,setter (char-code
          181                                (elt *gbk-unicode-mapping* index))
          182                               dest noctets)))
          183                   (incf noctets))
          184              finally (return (the fixnum (- noctets d-start)))))))