Raymond Toy pushed to branch master at cmucl / cmucl
Commits: 5dc0d7bf by Raymond Toy at 2023-03-24T14:31:40+00:00 Address #158: Filename encoding for Darwin
- - - - - 3578e015 by Raymond Toy at 2023-03-24T14:31:42+00:00 Merge branch 'issue-158-darwin-pathnames-utf8' into 'master'
Address #158: Filename encoding for Darwin
Closes #166 and #159
See merge request cmucl/cmucl!113 - - - - -
7 changed files:
- src/code/pathname.lisp - src/code/save.lisp - src/code/string.lisp - src/code/unicode.lisp - src/i18n/locale/cmucl.pot - tests/issues.lisp - + tests/resources/darwin/안녕하십니까.txt
Changes:
===================================== src/code/pathname.lisp ===================================== @@ -252,6 +252,14 @@ ;;; This constructor is used to make an instance of the correct type ;;; from parsed arguments.
+#+darwin +(defvar *enable-darwin-path-normalization* nil + "When non-NIL, pathnames are on Darwin are normalized when created. + Otherwise, the pathnames are unchanged. + + This must be NIL during bootstrapping because Unicode is not yet + available.") + (defun %make-pathname-object (host device directory name type version) (if (typep host 'logical-host) (flet ((upcasify (thing) @@ -271,7 +279,30 @@ (upcasify name) (upcasify type) (upcasify version))) - (%make-pathname host device directory name type version))) + #-darwin + (%make-pathname host device directory name type version) + #+darwin + (flet ((normalize-name (piece) + ;; Normalize Darwin pathnames by converting Hangul + ;; syllables to conjoining jamo, and converting the + ;; string to NFD form, but skipping over a range of + ;; characters. + (typecase piece + (string + (if *enable-darwin-path-normalization* + (decompose (unicode::decompose-hangul piece) + :compatibility nil + :darwinp t) + piece)) + (t + ;; What should we do about lisp::pattern objects + ;; that occur in the name component? + piece)))) + (%make-pathname host device + (mapcar #'normalize-name directory) + (normalize-name name) + (normalize-name type) + version))))
;;; *LOGICAL-HOSTS* --internal. ;;;
===================================== src/code/save.lisp ===================================== @@ -202,7 +202,7 @@ (site-init "library:site-init") (print-herald t) (process-command-line t) - #+:executable + #+:executable (executable nil) (batch-mode nil) (quiet nil))
===================================== src/code/string.lisp ===================================== @@ -1097,7 +1097,10 @@ #+unicode (progn -(defun decompose (string &optional (compatibility t)) +(defun decompose (string &key (compatibility t) (start 0) end darwinp) + "Convert STRING to NFD (or NFKD). If :darwinp is non-NIL, then + characters in the ranges U2000-U2FFF, UF900-UFA6A, and U2F800-U2FA1D + are not decomposed, as specified for Darwin pathnames." (declare (type string string)) (let ((result (make-string (cond ((< (length string) 40) (* 5 (length string))) @@ -1113,8 +1116,13 @@ (declare (type kernel:index i)) (multiple-value-bind (code wide) (codepoint string i) (when wide (incf i)) - (let ((decomp (unicode-decomp code compatibility))) - (if decomp (rec decomp 0 (length decomp)) (out code)))))) + (if (and darwinp + (or (<= #x2000 code #x2fff) + (<= #xf900 code #xfa6a) + (<= #x2f800 code #x2fa1d))) + (out code) + (let ((decomp (unicode-decomp code compatibility))) + (if decomp (rec decomp 0 (length decomp)) (out code))))))) (out (code) (multiple-value-bind (hi lo) (surrogates code) (outch hi) @@ -1151,7 +1159,7 @@ (schar result (1+ last))))) (decf last (if wide2 2 1))) (t (return)))))))) - (with-string string + (with-one-string string start end offset-var (rec string start end)) (shrink-vector result fillptr))))
@@ -1251,12 +1259,12 @@ (defun string-to-nfd (string) _N"Convert String to Unicode Normalization Form D (NFD) using the canonical decomposition. The NFD string is returned" - (decompose string nil)) + (decompose string :compatibility nil))
(defun string-to-nfkd (string) _N"Convert String to Unicode Normalization Form KD (NFKD) uisng the compatible decomposition form. The NFKD string is returned." - (decompose string t)) + (decompose string :compatibility t))
(defun string-to-nfc (string) _N"Convert String to Unicode Normalization Form C (NFC). If the
===================================== src/code/unicode.lisp ===================================== @@ -517,3 +517,55 @@ (if (eq casing :simple) (cl:string-capitalize string :start start :end end) (string-capitalize-full string :start start :end end :casing casing)))) + + +(defun decompose-hangul-syllable (cp stream) + "Decompose the Hangul syllable codepoint CP to an equivalent sequence + of conjoining jamo and print the decomposed result to the stream + STREAM." + (let* ((s-base #xac00) + (l-base #x1100) + (v-base #x1161) + (t-base #x11a7) + (v-count 21) + (t-count 28) + (n-count (* v-count t-count))) + ;; Step 1: Compute index of the syllable S + (let ((s-index (- cp s-base))) + ;; Step 2: If s is in the range 0 <= s <= s-count, the compute + ;; the components. + (let ((l (+ l-base (truncate s-index n-count))) + (v (+ v-base (truncate (mod s-index n-count) t-count))) + (tt (+ t-base (mod s-index t-count)))) + ;; Step 3: If tt = t-base, then there is no trailing character + ;; so replace s by the sequence <l,v>. Otherwise there is a + ;; trailing character, so replace s by the sequence <l,v,tt>. + (princ (code-char l) stream) + (princ (code-char v) stream) + (unless (= tt t-base) + (princ (code-char tt) stream))))) + (values)) + +(defun is-hangul-syllable (codepoint) + "Test if CODEPOINT is a Hangul syllable" + (let* ((s-base #xac00) + (l-count 19) + (v-count 21) + (t-count 28) + (n-count (* v-count t-count)) + (number-of-syllables (* l-count n-count))) + (<= 0 (- codepoint s-base) number-of-syllables))) + +(defun decompose-hangul (string) + "Decompose any Hangul syllables in STRING to an equivalent sequence of + conjoining jamo characters." + (with-output-to-string (s) + (loop for cp being the codepoints of string + do + (if (is-hangul-syllable cp) + (decompose-hangul-syllable cp s) + (multiple-value-bind (high low) + (surrogates cp) + (princ high s) + (when low + (princ low s)))))))
===================================== src/i18n/locale/cmucl.pot ===================================== @@ -4012,6 +4012,14 @@ msgid "" " string is returned." msgstr ""
+#: src/code/string.lisp +msgid "" +"Convert String to NFD (or NFKD). If :darwinp is non-NIL, then\n" +" characters in the ranges U2000-U2FFF, UF900-UFA6A, and\n" +" U2F800-U2FA1D are not decomposed, as specified for Darwin\n" +" pathnames." +msgstr "" + #: src/code/string.lisp msgid "" "Convert a sequence of codepoints to a string. Codepoints outside\n" @@ -15271,6 +15279,23 @@ msgid "" " delimited by non-case-modifiable chars. " msgstr ""
+#: src/code/unicode.lisp +msgid "" +"Decompose the Hangul syllable codepoint CP to an equivalent sequence\n" +" of conjoining jamo and print the decomposed result to the stream\n" +" STREAM." +msgstr "" + +#: src/code/unicode.lisp +msgid "Test if CODEPOINT is a Hangul syllable" +msgstr "" + +#: src/code/unicode.lisp +msgid "" +"Decompose any Hangul syllables in STRING to an equivalent sequence of\n" +" conjoining jamo characters." +msgstr "" + #: src/compiler/macros.lisp msgid "" "Policy Node Condition*\n"
===================================== tests/issues.lisp ===================================== @@ -832,6 +832,54 @@
+(define-test issue.158 + (:tag :issues) + (let* ((name (string #\Hangul_Syllable_Gyek)) + (path (make-pathname :directory (list :relative name) + :name name + :type name))) + ;; Enable this when we implement normalization for Darwin + #+(and nil darwin) + (let ((expected '(4352 4456 4543))) + ;; Tests that on Darwin the Hangul pathname has been normalized + ;; correctly. We fill in the directory, name, and type components + ;; with the same thing since it shouldn't really matter. + ;; + ;; The expected value is the conjoining jamo for the character + ;; #\Hangul_Syllable_Gyek. + (assert-equal (map 'list #'char-code (second (pathname-directory path))) + expected) + (assert-equal (map 'list #'char-code (pathname-name path)) + expected) + (assert-equal (map 'list #'char-code (pathname-type path)) + expected)) + #-darwin + (let ((expected (list (char-code #\Hangul_Syllable_Gyek)))) + ;; For other OSes, just assume that the pathname is unchanged. + (assert-equal (map 'list #'char-code (second (pathname-directory path))) + expected) + (assert-equal (map 'list #'char-code (pathname-name path)) + expected) + (assert-equal (map 'list #'char-code (pathname-type path)) + expected)))) + +(define-test issue.158.dir + (:tag :issues) + (flet ((get-file () + ;; This assumes that there is only one file in resources/darwin + (let ((files (directory (merge-pathnames "resources/darwin/*.txt" *test-path*)))) + (assert-equal (length files) 1) + (first files)))) + (let ((f (get-file)) + (expected-name "안녕하십니까")) + #+darwin + (assert-equal (pathname-name f) + (unicode::decompose-hangul expected-name)) + #-darwin + (assert-equal (pathname-name f) expected-name)))) + + + (define-test issue.166 (:tag :issues) ;; While this tests for the correct return value, the problem was @@ -896,4 +944,3 @@ (assert-true (typep idf-max-expo 'kernel:double-float-int-exponent)) (assert-true (typep (1- idf-max-expo) 'kernel:double-float-int-exponent)) (assert-false (typep (1+ idf-max-expo) 'kernel:double-float-int-exponent)))) -
===================================== tests/resources/darwin/안녕하십니까.txt ===================================== @@ -0,0 +1,3 @@ +The file name of this file is "안녕하십니까.txt" ("Hello" in Korean.) + +
View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/compare/c26f8ede5b0828a2db337fc...