enc-gbk.lisp - clic - Clic is an command line interactive client for gopher written in Common LISP (HTM) git clone git://bitreich.org/clic/ git://enlrupgkhuxnvlhsf6lc3fziv5h2hhfrinws65d7roiv6bfj7d652fid.onion/clic/ (DIR) Log (DIR) Files (DIR) Refs (DIR) Tags (DIR) README (DIR) LICENSE --- enc-gbk.lisp (8297B) --- 1 ;;;; -*- Mode: lisp; indent-tabs-mode: nil -*- 2 ;;; 3 ;;; enc-gbk.lisp --- GBK encodings. 4 ;;; 5 ;;; Copyright (C) 2011, Li Wenpeng <levin108@gmail.com> 6 ;;; 7 ;;; Permission is hereby granted, free of charge, to any person 8 ;;; obtaining a copy of this software and associated documentation 9 ;;; files (the "Software"), to deal in the Software without 10 ;;; restriction, including without limitation the rights to use, copy, 11 ;;; modify, merge, publish, distribute, sublicense, and/or sell copies 12 ;;; of the Software, and to permit persons to whom the Software is 13 ;;; furnished to do so, subject to the following conditions: 14 ;;; 15 ;;; The above copyright notice and this permission notice shall be 16 ;;; included in all copies or substantial portions of the Software. 17 ;;; 18 ;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 ;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 ;;; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 ;;; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 ;;; HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 ;;; WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 ;;; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 ;;; DEALINGS IN THE SOFTWARE. 26 27 (in-package #:babel-encodings) 28 29 ;; populated in gbk-map.lisp 30 (defvar *gbk-unicode-mapping*) 31 32 (defconstant +gbk2-offset+ 0) 33 (defconstant +gbk3-offset+ 6763) 34 (defconstant +gbk4-offset+ (+ 6763 6080)) 35 (defconstant +gbk1-offset+ 20902) 36 (defconstant +gbk5-offset+ (+ 20902 846)) 37 38 (define-character-encoding :gbk 39 "GBK is an extension of the GB2312 character set for simplified 40 Chinese characters, used in the People's Republic of China." 41 :max-units-per-char 4 42 :literal-char-code-limit #x80) 43 44 (define-condition invalid-gbk-byte (character-decoding-error) 45 () 46 (:documentation "Signalled when an invalid GBK byte is found.")) 47 48 (define-condition invalid-gbk-character (character-encoding-error) 49 () 50 (:documentation "Signalled when an invalid GBK character is found.")) 51 52 (define-octet-counter :gbk (getter type) 53 `(lambda (seq start end max) 54 (declare (type ,type seq) (fixnum start end max)) 55 (let ((noctets 0)) 56 (loop for i from start below end 57 for u1 of-type code-point = (,getter seq i) 58 do (cond ((< u1 #x80) (incf noctets)) 59 (t (incf noctets 2))) 60 (when (and (plusp max) (= noctets max)) 61 (return (values noctets i))) 62 finally (return (values noctets i)))))) 63 64 (define-code-point-counter :gbk (getter type) 65 `(lambda (seq start end max) 66 (declare (type ,type seq)) 67 (let (u1 (noctets 0)) 68 (loop with i = start 69 while (< i end) 70 do (setf u1 (,getter seq i)) 71 (cond 72 ((eq 0 (logand u1 #x80)) (incf i)) 73 (t (incf i 2))) 74 (incf noctets) 75 (when (and (plusp max) (= noctets max)) 76 (return (values noctets i))) 77 finally (return (values noctets i)))))) 78 79 (define-encoder :gbk (getter src-type setter dest-type) 80 `(lambda (src start end dest d-start) 81 (declare (type ,src-type src) 82 (type ,dest-type dest) 83 (fixnum start end d-start)) 84 (macrolet 85 ((do-encoding (index) 86 `(let ((u1 0) (u2 0)) 87 (cond 88 ((<= +gbk2-offset+ ,index (- +gbk3-offset+ 1)) ; gbk/2 89 (setf u1 (+ #xB0 (truncate (/ ,index 94)))) 90 (setf u2 (+ #xA1 (mod ,index 94)))) 91 ((<= +gbk3-offset+ ,index (- +gbk4-offset+ 1)) ; gbk/3 92 (setf index (- ,index +gbk3-offset+)) 93 (setf u1 (+ #x81 (truncate (/ ,index 190)))) 94 (setf u2 (+ #x40 (mod ,index 190))) 95 (if (>= u2 #x7F) (incf u2))) 96 ((<= +gbk4-offset+ ,index (- +gbk1-offset+ 1)) ; gbk/4 97 (setf index (- ,index +gbk4-offset+)) 98 (setf u1 (+ #xAA (truncate (/ ,index 96)))) 99 (setf u2 (+ #x40 (mod ,index 96))) 100 (if (>= u2 #x7F) (incf u2))) 101 ((<= +gbk1-offset+ ,index (- +gbk5-offset+ 1)) ; gbk/1 102 (setf index (- ,index +gbk1-offset+)) 103 (setf u1 (+ #xA1 (truncate (/ ,index 94)))) 104 (setf u2 (+ #xA1 (mod ,index 94)))) 105 ((<= +gbk5-offset+ ,index (length *gbk-unicode-mapping*)) ; gbk/5 106 (setf index (- ,index +gbk5-offset+)) 107 (setf u1 (+ #xA8 (truncate (/ ,index 96)))) 108 (setf u2 (+ #x40 (mod ,index 96))) 109 (if (>= u2 #x7F) (incf u2)))) 110 (values u1 u2)))) 111 (let ((c 0) index (noctets 0)) 112 (loop for i from start below end 113 for code of-type code-point = (,getter src i) 114 do (macrolet 115 ((handle-error (&optional (c 'character-encoding-error)) 116 `(encoding-error code :gbk src i +repl+ ',c))) 117 (setf c (code-char code)) 118 (cond 119 ((< code #x80) ; ascii 120 (,setter code dest noctets) 121 (incf noctets)) 122 (t ; gbk 123 (setf index 124 (position c *gbk-unicode-mapping*)) 125 126 (if (not index) 127 (handle-error invalid-gbk-character)) 128 (multiple-value-bind (uh ul) (do-encoding index) 129 (,setter uh dest noctets) 130 (,setter ul dest (+ 1 noctets)) 131 (incf noctets 2))))) 132 finally (return (the fixnum (- noctets d-start)))))))) 133 134 (define-decoder :gbk (getter src-type setter dest-type) 135 `(lambda (src start end dest d-start) 136 (declare (type ,src-type src) 137 (type ,dest-type dest)) 138 (let ((u1 0) (u2 0) (index 0) (tmp 0) (noctets 0)) 139 (loop with i = start 140 while (< i end) 141 do (macrolet 142 ((handle-error (&optional (c 'character-decoding-error)) 143 `(decoding-error #(u1 u2) :gbk src i +repl+ ',c))) 144 (setf u1 (,getter src i)) 145 (incf i) 146 (cond 147 ((eq 0 (logand u1 #x80)) 148 (,setter u1 dest noctets)) 149 (t 150 (setf u2 (,getter src i)) 151 (incf i) 152 (setf index 153 (block setter-block 154 (cond 155 ((and (<= #xB0 u1 #xF7) (<= #xA1 u2 #xFE)) 156 (+ +gbk2-offset+ (+ (* 94 (- u1 #xB0)) (- u2 #xA1)))) 157 158 ((and (<= #x81 u1 #xA0) (<= #x40 u2 #xFE)) 159 (cond ((> u2 #x7F) (setf tmp 1)) 160 (t (setf tmp 0))) 161 (+ +gbk3-offset+ (* 190 (- u1 #x81)) (- u2 #x40 tmp))) 162 163 ((and (<= #xAA u1 #xFE) (<= #x40 #xA0)) 164 (cond ((> u2 #x7F) (setf tmp 1)) 165 (t (setf tmp 0))) 166 (+ +gbk4-offset+ (* 96 (- u1 #xAA)) (- u2 #x40 tmp))) 167 168 ((and (<= #xA1 u1 #xA9) (<= #xA1 u2 #xFE)) 169 (+ +gbk1-offset+ (* 94 (- u1 #xA1)) (- u2 #xA1))) 170 171 ((and (<= #xA8 u1 #xA9) (<= #x40 #xA0)) 172 (cond ((> u2 #x7F) (setf tmp 1)) 173 (t (setf tmp 0))) 174 (+ +gbk5-offset+ (* 96 (- u1 #xA8)) (- u2 #x40 tmp))) 175 (t 176 (handle-error invalid-gbk-byte))))) 177 178 (when (>= index (length *gbk-unicode-mapping*)) 179 (handle-error invalid-gbk-byte)) 180 (,setter (char-code 181 (elt *gbk-unicode-mapping* index)) 182 dest noctets))) 183 (incf noctets)) 184 finally (return (the fixnum (- noctets d-start)))))))