Date: Saturday, September 18, 2010 @ 16:47:51 Author: rtoy Path: /project/cmucl/cvsroot/src
Modified: code/unidata.lisp tools/build-unidata.lisp
code/unidata.lisp: o Just add some comments on why we don't put the dictionaries in unidata.bin. o Print out some messages when building the hangul and cjk dictionaries so the user knows what's happening.
tools/build-unidata.lisp: o Add some comments on the various parts of unidata.bin.
--------------------------+ code/unidata.lisp | 25 +++++++++++++++++++++---- tools/build-unidata.lisp | 41 +++++++++++++++++++++++------------------ 2 files changed, 44 insertions(+), 22 deletions(-)
Index: src/code/unidata.lisp diff -u src/code/unidata.lisp:1.14 src/code/unidata.lisp:1.15 --- src/code/unidata.lisp:1.14 Fri Sep 17 19:29:01 2010 +++ src/code/unidata.lisp Sat Sep 18 16:47:51 2010 @@ -4,7 +4,7 @@ ;;; This code was written by Paul Foley and has been placed in the public ;;; domain. ;;; -(ext:file-comment "$Header: /project/cmucl/cvsroot/src/code/unidata.lisp,v 1.14 2010-09-17 23:29:01 rtoy Exp $") +(ext:file-comment "$Header: /project/cmucl/cvsroot/src/code/unidata.lisp,v 1.15 2010-09-18 20:47:51 rtoy Exp $") ;;; ;;; ********************************************************************** ;;; @@ -18,7 +18,7 @@
(defconstant +unidata-path+ #p"ext-formats:unidata.bin")
-(defvar *unidata-version* "$Revision: 1.14 $") +(defvar *unidata-version* "$Revision: 1.15 $")
(defstruct unidata range @@ -1116,6 +1116,17 @@ ;; Code written by Paul Foley, with some modifications by Raymond Toy. ;;
+;; These hold dictionaries for the Hangul syllables and the CJK +;; unified ideographs. Note that these could be stored in +;; unidata.bin, but that adds almost a megabyte to the size of +;; unidata.bin. That seems way to much bloat for something that is +;; probably not used that much. However, this incurs a runtime cost +;; the first time it needs to be accessed. On a 450 MHz sparc, it +;; takes 55 sec for the cjk dictionary and 9 sec for the Hangul +;; dictionary. A bit long but not too bad. On a 2 GHz mac mini, it +;; takes 5 sec and .8 sec, respectively. This seems reasonable, +;; especially since the intent is for character completion, which +;; doesn't have to be too fast. (defvar *hangul-syllable-dictionary* nil "Dictionary of Hangul syllables") (defvar *cjk-unified-ideograph-dictionary* nil @@ -1322,6 +1333,7 @@ (defun build-hangul-syllable-dictionary () "Build the dictionary for Hangul syllables" + (format t "~&Building Hangul Syllable dictionary. Please wait...~%") (initialize-reverse-hangul-tables) (let ((hangul-codebook (map 'vector #'car @@ -1344,10 +1356,13 @@ names) (incf k)))) (setf *hangul-syllable-dictionary* - (build-dictionary hangul-codebook (nreverse names))))) + (build-dictionary hangul-codebook (nreverse names))) + (format t "~&Done.~%") + (values)))
(defun build-cjk-unified-ideograph-dictionary () "Build the dictionary for CJK Unified Ideographs" + (format t "~&Building CJK Unified Ideographs dictionary. Please wait...~%") (let ((codebook (coerce (loop for k from 0 to 15 collect (format nil "~X" k)) 'vector)) @@ -1356,7 +1371,9 @@ collect (cons (format nil "~X" codepoint) codepoint)))) (setf *cjk-unified-ideograph-dictionary* - (build-dictionary codebook names)))) + (build-dictionary codebook names)) + (format t "~&Done.~%") + (values)))
;; The definitions of BUILD-DICTIONARY, NAME-LOOKUP, and ENCODE-NAME ;; were taken from build-unidata.lisp. Index: src/tools/build-unidata.lisp diff -u src/tools/build-unidata.lisp:1.5 src/tools/build-unidata.lisp:1.6 --- src/tools/build-unidata.lisp:1.5 Wed Sep 15 17:06:39 2010 +++ src/tools/build-unidata.lisp Sat Sep 18 16:47:51 2010 @@ -4,7 +4,7 @@ ;;; This code was written by Paul Foley and has been placed in the public ;;; domain. ;;; -(ext:file-comment "$Header: /project/cmucl/cvsroot/src/tools/build-unidata.lisp,v 1.5 2010-09-15 21:06:39 rtoy Exp $") +(ext:file-comment "$Header: /project/cmucl/cvsroot/src/tools/build-unidata.lisp,v 1.6 2010-09-18 20:47:51 rtoy Exp $") ;;; ;;; ********************************************************************** ;;; @@ -466,6 +466,8 @@ (error "Index array too short for the data being written"))))) (with-open-file (stm path :direction :io :if-exists :rename-and-delete :element-type '(unsigned-byte 8)) + ;; The length of the index array is the number of sections to be + ;; written. See below for each section. (let ((index (make-array 19 :fill-pointer 0))) ;; File header (write32 +unicode-magic-number+ stm) ; identification "magic" @@ -478,12 +480,12 @@ (dotimes (i (array-dimension index 0)) (write32 0 stm)) ; space for indices (write32 0 stm) ; end marker - ;; Range data + ;; 0. Range data (let ((data (unidata-range *unicode-data*))) (update-index (file-position stm) index) (write32 (length (range-codes data)) stm) (write-vector (range-codes data) stm :endian-swap :network-order)) - ;; Character name data + ;; 1. Character name data (let ((data (unidata-name+ *unicode-data*))) (update-index (file-position stm) index) (write-byte (1- (length (dictionary-cdbk data))) stm) @@ -499,41 +501,41 @@ (write-vector (dictionary-codev data) stm :endian-swap :network-order) (write-vector (dictionary-nextv data) stm :endian-swap :network-order) (write-vector (dictionary-namev data) stm :endian-swap :network-order)) - ;; Codepoint-to-name mapping + ;; 2. Codepoint-to-name mapping (let ((data (unidata-name *unicode-data*))) (update-index (file-position stm) index) (write-ntrie32 data stm)) - ;; Codepoint-to-category table + ;; 3. Codepoint-to-category table (let ((data (unidata-category *unicode-data*))) (update-index (file-position stm) index) (write-ntrie8 data stm)) - ;; Simple case mapping table + ;; 4. Simple case mapping table (let ((data (unidata-scase *unicode-data*))) (update-index (file-position stm) index) (write-ntrie32 data stm) (write-byte (length (scase-svec data)) stm) (write-vector (scase-svec data) stm :endian-swap :network-order)) - ;; Numeric data + ;; 5. Numeric data (let ((data (unidata-numeric *unicode-data*))) (update-index (file-position stm) index) (write-ntrie32 data stm)) - ;; Decomposition data + ;; 6. Decomposition data (let ((data (unidata-decomp *unicode-data*))) (update-index (file-position stm) index) (write-ntrie32 data stm) (write16 (length (decomp-tabl data)) stm) (write-vector (decomp-tabl data) stm :endian-swap :network-order)) - ;; Combining classes + ;; 7. Combining classes (let ((data (unidata-combining *unicode-data*))) (update-index (file-position stm) index) (write-ntrie8 data stm)) - ;; Bidi data + ;; 8. Bidi data (let ((data (unidata-bidi *unicode-data*))) (update-index (file-position stm) index) (write-ntrie16 data stm) (write-byte (length (bidi-tabl data)) stm) (write-vector (bidi-tabl data) stm :endian-swap :network-order)) - ;; Unicode 1.0 names + ;; 9. Unicode 1.0 names (let ((data (unidata-name1+ *unicode-data*))) (update-index (file-position stm) index) (write-byte (1- (length (dictionary-cdbk data))) stm) @@ -549,10 +551,11 @@ (write-vector (dictionary-codev data) stm :endian-swap :network-order) (write-vector (dictionary-nextv data) stm :endian-swap :network-order) (write-vector (dictionary-namev data) stm :endian-swap :network-order)) + ;; 10. Codepoint to unicode-1.0 name (let ((data (unidata-name1 *unicode-data*))) (update-index (file-position stm) index) (write-ntrie32 data stm)) - ;; Normalization quick-check data + ;; 11. Normalization quick-check data (update-index (file-position stm) index) (let ((data (unidata-qc-nfd *unicode-data*))) (write-ntrie1 data stm)) @@ -562,35 +565,37 @@ (write-ntrie2 data stm)) (let ((data (unidata-qc-nfkc *unicode-data*))) (write-ntrie2 data stm)) - ;; Write composition exclusion table + ;; 12. Write composition exclusion table (let ((data (unidata-comp-exclusions *unicode-data*))) (update-index (file-position stm) index) (write16 (length data) stm) (write-vector data stm :endian-swap :network-order)) - ;; Write full-case lower data (flet ((dump-full-case (data) (update-index (file-position stm) index) (write-ntrie32 data stm) (write16 (length (full-case-tabl data)) stm) (write-vector (full-case-tabl data) stm :endian-swap :network-order))) + ;; 13. Write full-case lower data (dump-full-case (unidata-full-case-lower *unicode-data*)) + ;; 14. Write full-case title data (dump-full-case (unidata-full-case-title *unicode-data*)) + ;; 15. Write full-case upper data (dump-full-case (unidata-full-case-upper *unicode-data*))) - ;; Write case folding data + ;; 16. Write case-folding simple data (let ((data (unidata-case-fold-simple *unicode-data*))) (update-index (file-position stm) index) (write-ntrie32 data stm)) - ;; case-folding full + ;; 17. Write case-folding full data (let ((data (unidata-case-fold-full *unicode-data*))) (update-index (file-position stm) index) (write-ntrie32 data stm) (write16 (length (case-folding-tabl data)) stm) (write-vector (case-folding-tabl data) stm :endian-swap :network-order)) - ;; Word-break + ;; 18. Word-break (let ((data (unidata-word-break *unicode-data*))) (update-index (file-position stm) index) (write-ntrie4 data stm)) - ;; Patch up index + ;; All components saved. Patch up index table now. (file-position stm 8) (dotimes (i (length index)) (write32 (aref index i) stm)))))