Raymond Toy pushed to branch issue-120-software-type-in-c at cmucl / cmucl
Commits: 5dc0d7bf by Raymond Toy at 2023-03-24T14:31:40+00:00 Address #158: Filename encoding for Darwin
- - - - - 3578e015 by Raymond Toy at 2023-03-24T14:31:42+00:00 Merge branch 'issue-158-darwin-pathnames-utf8' into 'master'
Address #158: Filename encoding for Darwin
Closes #166 and #159
See merge request cmucl/cmucl!113 - - - - - ce823be4 by Raymond Toy at 2023-03-24T08:17:30-07:00 Update release notes with recently closed issues
We left out a few issues that probably aren't relevant to users like #175 and #170. We also added #158 which isn't closed, but explains we support utf-8 on Darwin.
- - - - - b9b145ab by Raymond Toy at 2023-03-24T08:23:42-07:00 Add period at the end of each bug item
- - - - - a06dad9b by Raymond Toy at 2023-03-25T07:48:32-07:00 Merge branch 'master' into issue-120-software-type-in-c
- - - - -
8 changed files:
- src/code/pathname.lisp - src/code/save.lisp - src/code/string.lisp - src/code/unicode.lisp - src/general-info/release-21e.md - src/i18n/locale/cmucl.pot - tests/issues.lisp - + tests/resources/darwin/안녕하십니까.txt
Changes:
===================================== src/code/pathname.lisp ===================================== @@ -252,6 +252,14 @@ ;;; This constructor is used to make an instance of the correct type ;;; from parsed arguments.
+#+darwin +(defvar *enable-darwin-path-normalization* nil + "When non-NIL, pathnames are on Darwin are normalized when created. + Otherwise, the pathnames are unchanged. + + This must be NIL during bootstrapping because Unicode is not yet + available.") + (defun %make-pathname-object (host device directory name type version) (if (typep host 'logical-host) (flet ((upcasify (thing) @@ -271,7 +279,30 @@ (upcasify name) (upcasify type) (upcasify version))) - (%make-pathname host device directory name type version))) + #-darwin + (%make-pathname host device directory name type version) + #+darwin + (flet ((normalize-name (piece) + ;; Normalize Darwin pathnames by converting Hangul + ;; syllables to conjoining jamo, and converting the + ;; string to NFD form, but skipping over a range of + ;; characters. + (typecase piece + (string + (if *enable-darwin-path-normalization* + (decompose (unicode::decompose-hangul piece) + :compatibility nil + :darwinp t) + piece)) + (t + ;; What should we do about lisp::pattern objects + ;; that occur in the name component? + piece)))) + (%make-pathname host device + (mapcar #'normalize-name directory) + (normalize-name name) + (normalize-name type) + version))))
;;; *LOGICAL-HOSTS* --internal. ;;;
===================================== src/code/save.lisp ===================================== @@ -202,7 +202,7 @@ (site-init "library:site-init") (print-herald t) (process-command-line t) - #+:executable + #+:executable (executable nil) (batch-mode nil) (quiet nil))
===================================== src/code/string.lisp ===================================== @@ -1097,7 +1097,10 @@ #+unicode (progn -(defun decompose (string &optional (compatibility t)) +(defun decompose (string &key (compatibility t) (start 0) end darwinp) + "Convert STRING to NFD (or NFKD). If :darwinp is non-NIL, then + characters in the ranges U2000-U2FFF, UF900-UFA6A, and U2F800-U2FA1D + are not decomposed, as specified for Darwin pathnames." (declare (type string string)) (let ((result (make-string (cond ((< (length string) 40) (* 5 (length string))) @@ -1113,8 +1116,13 @@ (declare (type kernel:index i)) (multiple-value-bind (code wide) (codepoint string i) (when wide (incf i)) - (let ((decomp (unicode-decomp code compatibility))) - (if decomp (rec decomp 0 (length decomp)) (out code)))))) + (if (and darwinp + (or (<= #x2000 code #x2fff) + (<= #xf900 code #xfa6a) + (<= #x2f800 code #x2fa1d))) + (out code) + (let ((decomp (unicode-decomp code compatibility))) + (if decomp (rec decomp 0 (length decomp)) (out code))))))) (out (code) (multiple-value-bind (hi lo) (surrogates code) (outch hi) @@ -1151,7 +1159,7 @@ (schar result (1+ last))))) (decf last (if wide2 2 1))) (t (return)))))))) - (with-string string + (with-one-string string start end offset-var (rec string start end)) (shrink-vector result fillptr))))
@@ -1251,12 +1259,12 @@ (defun string-to-nfd (string) _N"Convert String to Unicode Normalization Form D (NFD) using the canonical decomposition. The NFD string is returned" - (decompose string nil)) + (decompose string :compatibility nil))
(defun string-to-nfkd (string) _N"Convert String to Unicode Normalization Form KD (NFKD) uisng the compatible decomposition form. The NFKD string is returned." - (decompose string t)) + (decompose string :compatibility t))
(defun string-to-nfc (string) _N"Convert String to Unicode Normalization Form C (NFC). If the
===================================== src/code/unicode.lisp ===================================== @@ -517,3 +517,55 @@ (if (eq casing :simple) (cl:string-capitalize string :start start :end end) (string-capitalize-full string :start start :end end :casing casing)))) + + +(defun decompose-hangul-syllable (cp stream) + "Decompose the Hangul syllable codepoint CP to an equivalent sequence + of conjoining jamo and print the decomposed result to the stream + STREAM." + (let* ((s-base #xac00) + (l-base #x1100) + (v-base #x1161) + (t-base #x11a7) + (v-count 21) + (t-count 28) + (n-count (* v-count t-count))) + ;; Step 1: Compute index of the syllable S + (let ((s-index (- cp s-base))) + ;; Step 2: If s is in the range 0 <= s <= s-count, the compute + ;; the components. + (let ((l (+ l-base (truncate s-index n-count))) + (v (+ v-base (truncate (mod s-index n-count) t-count))) + (tt (+ t-base (mod s-index t-count)))) + ;; Step 3: If tt = t-base, then there is no trailing character + ;; so replace s by the sequence <l,v>. Otherwise there is a + ;; trailing character, so replace s by the sequence <l,v,tt>. + (princ (code-char l) stream) + (princ (code-char v) stream) + (unless (= tt t-base) + (princ (code-char tt) stream))))) + (values)) + +(defun is-hangul-syllable (codepoint) + "Test if CODEPOINT is a Hangul syllable" + (let* ((s-base #xac00) + (l-count 19) + (v-count 21) + (t-count 28) + (n-count (* v-count t-count)) + (number-of-syllables (* l-count n-count))) + (<= 0 (- codepoint s-base) number-of-syllables))) + +(defun decompose-hangul (string) + "Decompose any Hangul syllables in STRING to an equivalent sequence of + conjoining jamo characters." + (with-output-to-string (s) + (loop for cp being the codepoints of string + do + (if (is-hangul-syllable cp) + (decompose-hangul-syllable cp s) + (multiple-value-bind (high low) + (surrogates cp) + (princ high s) + (when low + (princ low s)))))))
===================================== src/general-info/release-21e.md ===================================== @@ -22,63 +22,68 @@ public domain. * Feature enhancements * Changes * Update to ASDF 3.3.6 - * The default external format is `:utf-8` instead of `:iso8859-1` + * The default external format is `:utf-8` instead of `:iso8859-1`. * ANSI compliance fixes: * Bug fixes: * ~~#97~~ Fixes stepping through the source forms in the debugger. This has been broken for quite some time, but it works now.
* Gitlab tickets: - * ~~#68~~ gcc8.1.1 can't build lisp. Change optimization from `-O2` to `-O1` - * ~~#72~~ CMU user manual now part of cmucl-site - * ~~#73~~ Update clx from upstream clx - * ~~#77~~ Added tests for sqrt for exceptional values + * ~~#68~~ gcc8.1.1 can't build lisp. Change optimization from `-O2` to `-O1`. + * ~~#72~~ CMU user manual now part of cmucl-site. + * ~~#73~~ Update clx from upstream clx. + * ~~#77~~ Added tests for sqrt for exceptional values. * ~~#79~~ Autoload ASDF when calling `REQUIRE` the first time. User's no longer have to explicitly load ASDF anymore. * ~~#80~~ Use ASDF to load contribs. cmu-contribs still exists but does nothing. The contrib names are the same, except it's best to use a keyword instead of a string. So, `:contrib-demos` instead of `"contrib-demos"`. - * ~~#81~~ Added contribs from Eric Marsden - * ~~#82~~ Replace bc with expr in GNUMakefile - * ~~#86~~ Building with gcc 8 and later works when using -O2 optimization + * ~~#81~~ Added contribs from Eric Marsden. + * ~~#82~~ Replace bc with expr in GNUMakefile. + * ~~#86~~ Building with gcc 8 and later works when using -O2 optimization. * ~~#90~~ Some static symbols have been removed. This probably makes the fasl files incompatible with older versions. - * ~~#91~~ Loop destructuring no longer incorrectly signals an error - * ~~#95~~ Disassembler syntax of x86 je and movzx is incorrect + * ~~#91~~ Loop destructuring no longer incorrectly signals an error. + * ~~#95~~ Disassembler syntax of x86 je and movzx is incorrect. * ~~#97~~ Define and use ud2 instruction instead of int3. Fixes single-stepping. - * ~~#98~~ fstpd is not an Intel instruction; disassemble as `fstp dword ptr [addr]` + * ~~#98~~ fstpd is not an Intel instruction; disassemble as `fstp dword ptr [addr]`. * ~~#100~~ ldb prints out Unicode base-chars correctly instead of just the low 8 bits. - * ~~#103~~ RANDOM-MT19937-UPDATE assembly routine still exists + * ~~#103~~ RANDOM-MT19937-UPDATE assembly routine still exists. * ~~#104~~ Single-stepping broken (fixed via #97). - * ~~#107~~ Replace u_int8_t with uint8_t - * ~~#108~~ Update ASDF - * ~~#112~~ CLX can't connect to X server via inet sockets + * ~~#107~~ Replace u_int8_t with uint8_t. + * ~~#108~~ Update ASDF. + * ~~#112~~ CLX can't connect to X server via inet sockets. * ~~#113~~ REQUIRE on contribs can pull in the wrong things via ASDF.. - * ~~#120~~ `SOFTWARE-TYPE` and `SOFTWARE-VERSION` are implemented in C. + * ~~#120~~ `SOFTWARE-VERSION` is implemented in C. * ~~#121~~ Wrong column index in FILL-POINTER-OUTPUT-STREAM * ~~#122~~ gcc 11 can't build cmucl * ~~#124~~ directory with `:wild-inferiors` doesn't descend subdirectories * ~~#125~~ Linux `unix-stat` returning incorrect values * ~~#127~~ Linux unix-getpwuid segfaults when given non-existent uid.. - * ~~#128~~ `QUIT` accepts an exit code - * ~~#130~~ Move file-author to C - * ~~#132~~ Ansi test `RENAME-FILE.1` no longer fails - * ~~#134~~ Handle the case of `(expt complex complex-rational)` - * ~~#136~~ `ensure-directories-exist` should return the given pathspec - * #139 `*default-external-format*` defaults to `:utf-8`; add alias for `:locale` external format - * ~~#140~~ External format for streams that are not `file-stream`'s - * ~~#141~~ Disallow locales that are pathnames to a localedef file - * ~~#142~~ `(random 0)` signals incorrect error - * ~~#147~~ `stream-line-column` method missing for `fundamental-character-output-stream` - * ~~#149~~ Call setlocale(3C) on startup - * ~~#150~~ Add aliases for external format cp949 and euckr + * ~~#128~~ `QUIT` accepts an exit code. + * ~~#130~~ Move file-author to C. + * ~~#132~~ Ansi test `RENAME-FILE.1` no longer fails. + * ~~#134~~ Handle the case of `(expt complex complex-rational)`. + * ~~#136~~ `ensure-directories-exist` should return the given pathspec. + * #139 `*default-external-format*` defaults to `:utf-8`; add alias for `:locale` external format. + * ~~#140~~ External format for streams that are not `file-stream`'s. + * ~~#141~~ Disallow locales that are pathnames to a localedef file. + * ~~#142~~ `(random 0)` signals incorrect error. + * ~~#147~~ `stream-line-column` method missing for `fundamental-character-output-stream`. + * ~~#149~~ Call setlocale(3C) on startup. + * ~~#150~~ Add aliases for external format cp949 and euckr. * ~~#151~~ Change `*default-external-format*` to `:utf-8`. - * ~~#155~~ Wrap help strings neatly - * ~~#157~~ `(directory "foo/**/")` only returns directories now - * ~~#163~~ Add command-line option `-version` and `--version` to get lisp version - * ~~#165~~ Avoid inserting NIL into simple `LOOP` from `FORMAT` - * ~~#166~~ Fix incorrect type declaration for exponent from `integer-decode-float` + * ~~#152~~ Add new external format, `:locale` as an alias to the codeset from LANG and friends. + * ~~#!53~~ Terminals default to an encoding of `:locale`. + * ~~#155~~ Wrap help strings neatly. + * ~~#157~~ `(directory "foo/**/")` only returns directories now. + * #158 Darwin uses utf-8, but we don't support all the rules for pathnames. + * ~~#162~~ `*filename-encoding*` defaults to `:null` to mean no encoding. + * ~~#163~~ Add command-line option `-version` and `--version` to get lisp version. + * ~~#165~~ Avoid inserting NIL into simple `LOOP` from `FORMAT`. + * ~~#166~~ Fix incorrect type declaration for exponent from `integer-decode-float`. * ~~#167~~ Low bound for `decode-float-exponent` type was off by one.. - * ~~#168~~ Don't use negated forms for jmp instructions when possible - * ~~#169~~ Add pprinter for `define-vop` and `sc-case` + * ~~#168~~ Don't use negated forms for jmp instructions when possible. + * ~~#169~~ Add pprinter for `define-vop` and `sc-case`. * ~~#172~~ Declare `pathname-match-p` as returning `nil` or `pathname`. - * ~~#173~~ Add pprinter for `define-assembly-routine` + * ~~#173~~ Add pprinter for `define-assembly-routine`. * ~~#176~~ `SHORT-SITE-NAME` and `LONG-SITE-NAME` return `NIL`. + * ~~#177~~ Add pprinter for `deftransform` and `defoptimizer`. * Other changes: * Improvements to the PCL implementation of CLOS: * Changes to building procedure:
===================================== src/i18n/locale/cmucl.pot ===================================== @@ -4012,6 +4012,14 @@ msgid "" " string is returned." msgstr ""
+#: src/code/string.lisp +msgid "" +"Convert String to NFD (or NFKD). If :darwinp is non-NIL, then\n" +" characters in the ranges U2000-U2FFF, UF900-UFA6A, and\n" +" U2F800-U2FA1D are not decomposed, as specified for Darwin\n" +" pathnames." +msgstr "" + #: src/code/string.lisp msgid "" "Convert a sequence of codepoints to a string. Codepoints outside\n" @@ -15267,6 +15275,23 @@ msgid "" " delimited by non-case-modifiable chars. " msgstr ""
+#: src/code/unicode.lisp +msgid "" +"Decompose the Hangul syllable codepoint CP to an equivalent sequence\n" +" of conjoining jamo and print the decomposed result to the stream\n" +" STREAM." +msgstr "" + +#: src/code/unicode.lisp +msgid "Test if CODEPOINT is a Hangul syllable" +msgstr "" + +#: src/code/unicode.lisp +msgid "" +"Decompose any Hangul syllables in STRING to an equivalent sequence of\n" +" conjoining jamo characters." +msgstr "" + #: src/compiler/macros.lisp msgid "" "Policy Node Condition*\n"
===================================== tests/issues.lisp ===================================== @@ -832,6 +832,54 @@
+(define-test issue.158 + (:tag :issues) + (let* ((name (string #\Hangul_Syllable_Gyek)) + (path (make-pathname :directory (list :relative name) + :name name + :type name))) + ;; Enable this when we implement normalization for Darwin + #+(and nil darwin) + (let ((expected '(4352 4456 4543))) + ;; Tests that on Darwin the Hangul pathname has been normalized + ;; correctly. We fill in the directory, name, and type components + ;; with the same thing since it shouldn't really matter. + ;; + ;; The expected value is the conjoining jamo for the character + ;; #\Hangul_Syllable_Gyek. + (assert-equal (map 'list #'char-code (second (pathname-directory path))) + expected) + (assert-equal (map 'list #'char-code (pathname-name path)) + expected) + (assert-equal (map 'list #'char-code (pathname-type path)) + expected)) + #-darwin + (let ((expected (list (char-code #\Hangul_Syllable_Gyek)))) + ;; For other OSes, just assume that the pathname is unchanged. + (assert-equal (map 'list #'char-code (second (pathname-directory path))) + expected) + (assert-equal (map 'list #'char-code (pathname-name path)) + expected) + (assert-equal (map 'list #'char-code (pathname-type path)) + expected)))) + +(define-test issue.158.dir + (:tag :issues) + (flet ((get-file () + ;; This assumes that there is only one file in resources/darwin + (let ((files (directory (merge-pathnames "resources/darwin/*.txt" *test-path*)))) + (assert-equal (length files) 1) + (first files)))) + (let ((f (get-file)) + (expected-name "안녕하십니까")) + #+darwin + (assert-equal (pathname-name f) + (unicode::decompose-hangul expected-name)) + #-darwin + (assert-equal (pathname-name f) expected-name)))) + + + (define-test issue.166 (:tag :issues) ;; While this tests for the correct return value, the problem was @@ -896,4 +944,3 @@ (assert-true (typep idf-max-expo 'kernel:double-float-int-exponent)) (assert-true (typep (1- idf-max-expo) 'kernel:double-float-int-exponent)) (assert-false (typep (1+ idf-max-expo) 'kernel:double-float-int-exponent)))) -
===================================== tests/resources/darwin/안녕하십니까.txt ===================================== @@ -0,0 +1,3 @@ +The file name of this file is "안녕하십니까.txt" ("Hello" in Korean.) + +
View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/compare/531ea53c4501269b59aa81e...