Date: Saturday, September 18, 2010 @ 22:37:11 Author: rtoy Path: /project/cmucl/cvsroot/src/code
Modified: unidata.lisp
o Construction of the Hangul syllable codebook was wrong. To satisfy the constraints on the codebook, we just sort them in descreasing order of length. o In %MIP, it might happen that MISMATCH returns NIL, which means a match. In this case, don't change the position.
--------------+ unidata.lisp | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-)
Index: src/code/unidata.lisp diff -u src/code/unidata.lisp:1.17 src/code/unidata.lisp:1.18 --- src/code/unidata.lisp:1.17 Sat Sep 18 17:38:10 2010 +++ src/code/unidata.lisp Sat Sep 18 22:37:10 2010 @@ -4,7 +4,7 @@ ;;; This code was written by Paul Foley and has been placed in the public ;;; domain. ;;; -(ext:file-comment "$Header: /project/cmucl/cvsroot/src/code/unidata.lisp,v 1.17 2010-09-18 21:38:10 rtoy Exp $") +(ext:file-comment "$Header: /project/cmucl/cvsroot/src/code/unidata.lisp,v 1.18 2010-09-19 02:37:10 rtoy Exp $") ;;; ;;; ********************************************************************** ;;; @@ -18,7 +18,7 @@
(defconstant +unidata-path+ #p"ext-formats:unidata.bin")
-(defvar *unidata-version* "$Revision: 1.17 $") +(defvar *unidata-version* "$Revision: 1.18 $")
(defstruct unidata range @@ -1304,7 +1304,7 @@ (let* ((first (first strings)) (posn (length first))) (dolist (string (rest strings)) - (setq posn (mismatch first string :end1 posn))) + (setq posn (or (mismatch first string :end1 posn) posn))) (subseq first 0 posn)))
(defun node-next (i &optional (dict (unidata-name+ *unicode-data*))) @@ -1334,15 +1334,23 @@ (defun build-hangul-syllable-dictionary () "Build the dictionary for Hangul syllables" (format t "~&Building Hangul Syllable dictionary. Please wait...~%") + (force-output) (initialize-reverse-hangul-tables) (let ((hangul-codebook - (map 'vector #'car - (delete "" - (concatenate 'vector - *reverse-hangul-choseong* - *reverse-hangul-jungseong* - *reverse-hangul-jongseong*) - :test #'string= :key #'car))) + ;; For our codebook, combine all the choseong, jungseong, and + ;; jonseong syllables, but removing empty strings (there's at + ;; least one). Then sort these according to length. This + ;; ensures that if A is an initial substring of B, then B + ;; must come before A (or A will never be used). (See + ;; tools/build-unidata.lisp, *codebook*.) + (sort (map 'vector #'car + (delete "" + (concatenate 'vector + *reverse-hangul-choseong* + *reverse-hangul-jungseong* + *reverse-hangul-jongseong*) + :test #'string= :key #'car)) + #'> :key #'length)) (names (loop for codepoint from 0 below codepoint-limit when (hangul-syllable-p codepoint) @@ -1352,13 +1360,15 @@ codepoint))))
(setf *hangul-syllable-dictionary* - (build-dictionary hangul-codebook (nreverse names))) + (build-dictionary hangul-codebook names)) (format t "~&Done.~%") + (force-output) (values)))
(defun build-cjk-unified-ideograph-dictionary () "Build the dictionary for CJK Unified Ideographs" (format t "~&Building CJK Unified Ideographs dictionary. Please wait...~%") + (force-output) (let ((codebook (coerce (loop for k from 0 to 15 collect (format nil "~X" k)) 'vector)) @@ -1369,6 +1379,7 @@ (setf *cjk-unified-ideograph-dictionary* (build-dictionary codebook names)) (format t "~&Done.~%") + (force-output) (values)))
;; The definitions of BUILD-DICTIONARY, NAME-LOOKUP, and ENCODE-NAME