Raymond Toy pushed to branch issue-120-software-type-in-c at cmucl / cmucl
Commits:
-
5dc0d7bf
by Raymond Toy at 2023-03-24T14:31:40+00:00
-
3578e015
by Raymond Toy at 2023-03-24T14:31:42+00:00
-
ce823be4
by Raymond Toy at 2023-03-24T08:17:30-07:00
-
b9b145ab
by Raymond Toy at 2023-03-24T08:23:42-07:00
-
a06dad9b
by Raymond Toy at 2023-03-25T07:48:32-07:00
8 changed files:
- src/code/pathname.lisp
- src/code/save.lisp
- src/code/string.lisp
- src/code/unicode.lisp
- src/general-info/release-21e.md
- src/i18n/locale/cmucl.pot
- tests/issues.lisp
- + tests/resources/darwin/안녕하십니까.txt
Changes:
... | ... | @@ -252,6 +252,14 @@ |
252 | 252 | ;;; This constructor is used to make an instance of the correct type
|
253 | 253 | ;;; from parsed arguments.
|
254 | 254 | |
255 | +#+darwin
|
|
256 | +(defvar *enable-darwin-path-normalization* nil
|
|
257 | + "When non-NIL, pathnames are on Darwin are normalized when created.
|
|
258 | + Otherwise, the pathnames are unchanged.
|
|
259 | + |
|
260 | + This must be NIL during bootstrapping because Unicode is not yet
|
|
261 | + available.")
|
|
262 | + |
|
255 | 263 | (defun %make-pathname-object (host device directory name type version)
|
256 | 264 | (if (typep host 'logical-host)
|
257 | 265 | (flet ((upcasify (thing)
|
... | ... | @@ -271,7 +279,30 @@ |
271 | 279 | (upcasify name)
|
272 | 280 | (upcasify type)
|
273 | 281 | (upcasify version)))
|
274 | - (%make-pathname host device directory name type version)))
|
|
282 | + #-darwin
|
|
283 | + (%make-pathname host device directory name type version)
|
|
284 | + #+darwin
|
|
285 | + (flet ((normalize-name (piece)
|
|
286 | + ;; Normalize Darwin pathnames by converting Hangul
|
|
287 | + ;; syllables to conjoining jamo, and converting the
|
|
288 | + ;; string to NFD form, but skipping over a range of
|
|
289 | + ;; characters.
|
|
290 | + (typecase piece
|
|
291 | + (string
|
|
292 | + (if *enable-darwin-path-normalization*
|
|
293 | + (decompose (unicode::decompose-hangul piece)
|
|
294 | + :compatibility nil
|
|
295 | + :darwinp t)
|
|
296 | + piece))
|
|
297 | + (t
|
|
298 | + ;; What should we do about lisp::pattern objects
|
|
299 | + ;; that occur in the name component?
|
|
300 | + piece))))
|
|
301 | + (%make-pathname host device
|
|
302 | + (mapcar #'normalize-name directory)
|
|
303 | + (normalize-name name)
|
|
304 | + (normalize-name type)
|
|
305 | + version))))
|
|
275 | 306 | |
276 | 307 | ;;; *LOGICAL-HOSTS* --internal.
|
277 | 308 | ;;;
|
... | ... | @@ -202,7 +202,7 @@ |
202 | 202 | (site-init "library:site-init")
|
203 | 203 | (print-herald t)
|
204 | 204 | (process-command-line t)
|
205 | - #+:executable
|
|
205 | + #+:executable
|
|
206 | 206 | (executable nil)
|
207 | 207 | (batch-mode nil)
|
208 | 208 | (quiet nil))
|
... | ... | @@ -1097,7 +1097,10 @@ |
1097 | 1097 | |
1098 | 1098 | #+unicode
|
1099 | 1099 | (progn
|
1100 | -(defun decompose (string &optional (compatibility t))
|
|
1100 | +(defun decompose (string &key (compatibility t) (start 0) end darwinp)
|
|
1101 | + "Convert STRING to NFD (or NFKD). If :darwinp is non-NIL, then
|
|
1102 | + characters in the ranges U2000-U2FFF, UF900-UFA6A, and U2F800-U2FA1D
|
|
1103 | + are not decomposed, as specified for Darwin pathnames."
|
|
1101 | 1104 | (declare (type string string))
|
1102 | 1105 | (let ((result (make-string (cond ((< (length string) 40)
|
1103 | 1106 | (* 5 (length string)))
|
... | ... | @@ -1113,8 +1116,13 @@ |
1113 | 1116 | (declare (type kernel:index i))
|
1114 | 1117 | (multiple-value-bind (code wide) (codepoint string i)
|
1115 | 1118 | (when wide (incf i))
|
1116 | - (let ((decomp (unicode-decomp code compatibility)))
|
|
1117 | - (if decomp (rec decomp 0 (length decomp)) (out code))))))
|
|
1119 | + (if (and darwinp
|
|
1120 | + (or (<= #x2000 code #x2fff)
|
|
1121 | + (<= #xf900 code #xfa6a)
|
|
1122 | + (<= #x2f800 code #x2fa1d)))
|
|
1123 | + (out code)
|
|
1124 | + (let ((decomp (unicode-decomp code compatibility)))
|
|
1125 | + (if decomp (rec decomp 0 (length decomp)) (out code)))))))
|
|
1118 | 1126 | (out (code)
|
1119 | 1127 | (multiple-value-bind (hi lo) (surrogates code)
|
1120 | 1128 | (outch hi)
|
... | ... | @@ -1151,7 +1159,7 @@ |
1151 | 1159 | (schar result (1+ last)))))
|
1152 | 1160 | (decf last (if wide2 2 1)))
|
1153 | 1161 | (t (return))))))))
|
1154 | - (with-string string
|
|
1162 | + (with-one-string string start end offset-var
|
|
1155 | 1163 | (rec string start end))
|
1156 | 1164 | (shrink-vector result fillptr))))
|
1157 | 1165 | |
... | ... | @@ -1251,12 +1259,12 @@ |
1251 | 1259 | (defun string-to-nfd (string)
|
1252 | 1260 | _N"Convert String to Unicode Normalization Form D (NFD) using the
|
1253 | 1261 | canonical decomposition. The NFD string is returned"
|
1254 | - (decompose string nil))
|
|
1262 | + (decompose string :compatibility nil))
|
|
1255 | 1263 | |
1256 | 1264 | (defun string-to-nfkd (string)
|
1257 | 1265 | _N"Convert String to Unicode Normalization Form KD (NFKD) uisng the
|
1258 | 1266 | compatible decomposition form. The NFKD string is returned."
|
1259 | - (decompose string t))
|
|
1267 | + (decompose string :compatibility t))
|
|
1260 | 1268 | |
1261 | 1269 | (defun string-to-nfc (string)
|
1262 | 1270 | _N"Convert String to Unicode Normalization Form C (NFC). If the
|
... | ... | @@ -517,3 +517,55 @@ |
517 | 517 | (if (eq casing :simple)
|
518 | 518 | (cl:string-capitalize string :start start :end end)
|
519 | 519 | (string-capitalize-full string :start start :end end :casing casing))))
|
520 | + |
|
521 | + |
|
522 | +(defun decompose-hangul-syllable (cp stream)
|
|
523 | + "Decompose the Hangul syllable codepoint CP to an equivalent sequence
|
|
524 | + of conjoining jamo and print the decomposed result to the stream
|
|
525 | + STREAM."
|
|
526 | + (let* ((s-base #xac00)
|
|
527 | + (l-base #x1100)
|
|
528 | + (v-base #x1161)
|
|
529 | + (t-base #x11a7)
|
|
530 | + (v-count 21)
|
|
531 | + (t-count 28)
|
|
532 | + (n-count (* v-count t-count)))
|
|
533 | + ;; Step 1: Compute index of the syllable S
|
|
534 | + (let ((s-index (- cp s-base)))
|
|
535 | + ;; Step 2: If s is in the range 0 <= s <= s-count, the compute
|
|
536 | + ;; the components.
|
|
537 | + (let ((l (+ l-base (truncate s-index n-count)))
|
|
538 | + (v (+ v-base (truncate (mod s-index n-count) t-count)))
|
|
539 | + (tt (+ t-base (mod s-index t-count))))
|
|
540 | + ;; Step 3: If tt = t-base, then there is no trailing character
|
|
541 | + ;; so replace s by the sequence <l,v>. Otherwise there is a
|
|
542 | + ;; trailing character, so replace s by the sequence <l,v,tt>.
|
|
543 | + (princ (code-char l) stream)
|
|
544 | + (princ (code-char v) stream)
|
|
545 | + (unless (= tt t-base)
|
|
546 | + (princ (code-char tt) stream)))))
|
|
547 | + (values))
|
|
548 | + |
|
549 | +(defun is-hangul-syllable (codepoint)
|
|
550 | + "Test if CODEPOINT is a Hangul syllable"
|
|
551 | + (let* ((s-base #xac00)
|
|
552 | + (l-count 19)
|
|
553 | + (v-count 21)
|
|
554 | + (t-count 28)
|
|
555 | + (n-count (* v-count t-count))
|
|
556 | + (number-of-syllables (* l-count n-count)))
|
|
557 | + (<= 0 (- codepoint s-base) number-of-syllables)))
|
|
558 | + |
|
559 | +(defun decompose-hangul (string)
|
|
560 | + "Decompose any Hangul syllables in STRING to an equivalent sequence of
|
|
561 | + conjoining jamo characters."
|
|
562 | + (with-output-to-string (s)
|
|
563 | + (loop for cp being the codepoints of string
|
|
564 | + do
|
|
565 | + (if (is-hangul-syllable cp)
|
|
566 | + (decompose-hangul-syllable cp s)
|
|
567 | + (multiple-value-bind (high low)
|
|
568 | + (surrogates cp)
|
|
569 | + (princ high s)
|
|
570 | + (when low
|
|
571 | + (princ low s))))))) |
... | ... | @@ -22,63 +22,68 @@ public domain. |
22 | 22 | * Feature enhancements
|
23 | 23 | * Changes
|
24 | 24 | * Update to ASDF 3.3.6
|
25 | - * The default external format is `:utf-8` instead of `:iso8859-1`
|
|
25 | + * The default external format is `:utf-8` instead of `:iso8859-1`.
|
|
26 | 26 | * ANSI compliance fixes:
|
27 | 27 | * Bug fixes:
|
28 | 28 | * ~~#97~~ Fixes stepping through the source forms in the debugger. This has been broken for quite some time, but it works now.
|
29 | 29 | |
30 | 30 | * Gitlab tickets:
|
31 | - * ~~#68~~ gcc8.1.1 can't build lisp. Change optimization from `-O2` to `-O1`
|
|
32 | - * ~~#72~~ CMU user manual now part of cmucl-site
|
|
33 | - * ~~#73~~ Update clx from upstream clx
|
|
34 | - * ~~#77~~ Added tests for sqrt for exceptional values
|
|
31 | + * ~~#68~~ gcc8.1.1 can't build lisp. Change optimization from `-O2` to `-O1`.
|
|
32 | + * ~~#72~~ CMU user manual now part of cmucl-site.
|
|
33 | + * ~~#73~~ Update clx from upstream clx.
|
|
34 | + * ~~#77~~ Added tests for sqrt for exceptional values.
|
|
35 | 35 | * ~~#79~~ Autoload ASDF when calling `REQUIRE` the first time. User's no longer have to explicitly load ASDF anymore.
|
36 | 36 | * ~~#80~~ Use ASDF to load contribs. cmu-contribs still exists but does nothing. The contrib names are the same, except it's best to use a keyword instead of a string. So, `:contrib-demos` instead of `"contrib-demos"`.
|
37 | - * ~~#81~~ Added contribs from Eric Marsden
|
|
38 | - * ~~#82~~ Replace bc with expr in GNUMakefile
|
|
39 | - * ~~#86~~ Building with gcc 8 and later works when using -O2 optimization
|
|
37 | + * ~~#81~~ Added contribs from Eric Marsden.
|
|
38 | + * ~~#82~~ Replace bc with expr in GNUMakefile.
|
|
39 | + * ~~#86~~ Building with gcc 8 and later works when using -O2 optimization.
|
|
40 | 40 | * ~~#90~~ Some static symbols have been removed. This probably makes the fasl files incompatible with older versions.
|
41 | - * ~~#91~~ Loop destructuring no longer incorrectly signals an error
|
|
42 | - * ~~#95~~ Disassembler syntax of x86 je and movzx is incorrect
|
|
41 | + * ~~#91~~ Loop destructuring no longer incorrectly signals an error.
|
|
42 | + * ~~#95~~ Disassembler syntax of x86 je and movzx is incorrect.
|
|
43 | 43 | * ~~#97~~ Define and use ud2 instruction instead of int3. Fixes single-stepping.
|
44 | - * ~~#98~~ fstpd is not an Intel instruction; disassemble as `fstp dword ptr [addr]`
|
|
44 | + * ~~#98~~ fstpd is not an Intel instruction; disassemble as `fstp dword ptr [addr]`.
|
|
45 | 45 | * ~~#100~~ ldb prints out Unicode base-chars correctly instead of just the low 8 bits.
|
46 | - * ~~#103~~ RANDOM-MT19937-UPDATE assembly routine still exists
|
|
46 | + * ~~#103~~ RANDOM-MT19937-UPDATE assembly routine still exists.
|
|
47 | 47 | * ~~#104~~ Single-stepping broken (fixed via #97).
|
48 | - * ~~#107~~ Replace u_int8_t with uint8_t
|
|
49 | - * ~~#108~~ Update ASDF
|
|
50 | - * ~~#112~~ CLX can't connect to X server via inet sockets
|
|
48 | + * ~~#107~~ Replace u_int8_t with uint8_t.
|
|
49 | + * ~~#108~~ Update ASDF.
|
|
50 | + * ~~#112~~ CLX can't connect to X server via inet sockets.
|
|
51 | 51 | * ~~#113~~ REQUIRE on contribs can pull in the wrong things via ASDF.
|
52 | - * ~~#120~~ `SOFTWARE-TYPE` and `SOFTWARE-VERSION` are implemented in C.
|
|
52 | + * ~~#120~~ `SOFTWARE-VERSION` is implemented in C.
|
|
53 | 53 | * ~~#121~~ Wrong column index in FILL-POINTER-OUTPUT-STREAM
|
54 | 54 | * ~~#122~~ gcc 11 can't build cmucl
|
55 | 55 | * ~~#124~~ directory with `:wild-inferiors` doesn't descend subdirectories
|
56 | 56 | * ~~#125~~ Linux `unix-stat` returning incorrect values
|
57 | 57 | * ~~#127~~ Linux unix-getpwuid segfaults when given non-existent uid.
|
58 | - * ~~#128~~ `QUIT` accepts an exit code
|
|
59 | - * ~~#130~~ Move file-author to C
|
|
60 | - * ~~#132~~ Ansi test `RENAME-FILE.1` no longer fails
|
|
61 | - * ~~#134~~ Handle the case of `(expt complex complex-rational)`
|
|
62 | - * ~~#136~~ `ensure-directories-exist` should return the given pathspec
|
|
63 | - * #139 `*default-external-format*` defaults to `:utf-8`; add alias for `:locale` external format
|
|
64 | - * ~~#140~~ External format for streams that are not `file-stream`'s
|
|
65 | - * ~~#141~~ Disallow locales that are pathnames to a localedef file
|
|
66 | - * ~~#142~~ `(random 0)` signals incorrect error
|
|
67 | - * ~~#147~~ `stream-line-column` method missing for `fundamental-character-output-stream`
|
|
68 | - * ~~#149~~ Call setlocale(3C) on startup
|
|
69 | - * ~~#150~~ Add aliases for external format cp949 and euckr
|
|
58 | + * ~~#128~~ `QUIT` accepts an exit code.
|
|
59 | + * ~~#130~~ Move file-author to C.
|
|
60 | + * ~~#132~~ Ansi test `RENAME-FILE.1` no longer fails.
|
|
61 | + * ~~#134~~ Handle the case of `(expt complex complex-rational)`.
|
|
62 | + * ~~#136~~ `ensure-directories-exist` should return the given pathspec.
|
|
63 | + * #139 `*default-external-format*` defaults to `:utf-8`; add alias for `:locale` external format.
|
|
64 | + * ~~#140~~ External format for streams that are not `file-stream`'s.
|
|
65 | + * ~~#141~~ Disallow locales that are pathnames to a localedef file.
|
|
66 | + * ~~#142~~ `(random 0)` signals incorrect error.
|
|
67 | + * ~~#147~~ `stream-line-column` method missing for `fundamental-character-output-stream`.
|
|
68 | + * ~~#149~~ Call setlocale(3C) on startup.
|
|
69 | + * ~~#150~~ Add aliases for external format cp949 and euckr.
|
|
70 | 70 | * ~~#151~~ Change `*default-external-format*` to `:utf-8`.
|
71 | - * ~~#155~~ Wrap help strings neatly
|
|
72 | - * ~~#157~~ `(directory "foo/**/")` only returns directories now
|
|
73 | - * ~~#163~~ Add command-line option `-version` and `--version` to get lisp version
|
|
74 | - * ~~#165~~ Avoid inserting NIL into simple `LOOP` from `FORMAT`
|
|
75 | - * ~~#166~~ Fix incorrect type declaration for exponent from `integer-decode-float`
|
|
71 | + * ~~#152~~ Add new external format, `:locale` as an alias to the codeset from LANG and friends.
|
|
72 | + * ~~#!53~~ Terminals default to an encoding of `:locale`.
|
|
73 | + * ~~#155~~ Wrap help strings neatly.
|
|
74 | + * ~~#157~~ `(directory "foo/**/")` only returns directories now.
|
|
75 | + * #158 Darwin uses utf-8, but we don't support all the rules for pathnames.
|
|
76 | + * ~~#162~~ `*filename-encoding*` defaults to `:null` to mean no encoding.
|
|
77 | + * ~~#163~~ Add command-line option `-version` and `--version` to get lisp version.
|
|
78 | + * ~~#165~~ Avoid inserting NIL into simple `LOOP` from `FORMAT`.
|
|
79 | + * ~~#166~~ Fix incorrect type declaration for exponent from `integer-decode-float`.
|
|
76 | 80 | * ~~#167~~ Low bound for `decode-float-exponent` type was off by one.
|
77 | - * ~~#168~~ Don't use negated forms for jmp instructions when possible
|
|
78 | - * ~~#169~~ Add pprinter for `define-vop` and `sc-case`
|
|
81 | + * ~~#168~~ Don't use negated forms for jmp instructions when possible.
|
|
82 | + * ~~#169~~ Add pprinter for `define-vop` and `sc-case`.
|
|
79 | 83 | * ~~#172~~ Declare `pathname-match-p` as returning `nil` or `pathname`.
|
80 | - * ~~#173~~ Add pprinter for `define-assembly-routine`
|
|
84 | + * ~~#173~~ Add pprinter for `define-assembly-routine`.
|
|
81 | 85 | * ~~#176~~ `SHORT-SITE-NAME` and `LONG-SITE-NAME` return `NIL`.
|
86 | + * ~~#177~~ Add pprinter for `deftransform` and `defoptimizer`.
|
|
82 | 87 | * Other changes:
|
83 | 88 | * Improvements to the PCL implementation of CLOS:
|
84 | 89 | * Changes to building procedure:
|
... | ... | @@ -4012,6 +4012,14 @@ msgid "" |
4012 | 4012 | " string is returned."
|
4013 | 4013 | msgstr ""
|
4014 | 4014 | |
4015 | +#: src/code/string.lisp
|
|
4016 | +msgid ""
|
|
4017 | +"Convert String to NFD (or NFKD). If :darwinp is non-NIL, then\n"
|
|
4018 | +" characters in the ranges U2000-U2FFF, UF900-UFA6A, and\n"
|
|
4019 | +" U2F800-U2FA1D are not decomposed, as specified for Darwin\n"
|
|
4020 | +" pathnames."
|
|
4021 | +msgstr ""
|
|
4022 | + |
|
4015 | 4023 | #: src/code/string.lisp
|
4016 | 4024 | msgid ""
|
4017 | 4025 | "Convert a sequence of codepoints to a string. Codepoints outside\n"
|
... | ... | @@ -15267,6 +15275,23 @@ msgid "" |
15267 | 15275 | " delimited by non-case-modifiable chars. "
|
15268 | 15276 | msgstr ""
|
15269 | 15277 | |
15278 | +#: src/code/unicode.lisp
|
|
15279 | +msgid ""
|
|
15280 | +"Decompose the Hangul syllable codepoint CP to an equivalent sequence\n"
|
|
15281 | +" of conjoining jamo and print the decomposed result to the stream\n"
|
|
15282 | +" STREAM."
|
|
15283 | +msgstr ""
|
|
15284 | + |
|
15285 | +#: src/code/unicode.lisp
|
|
15286 | +msgid "Test if CODEPOINT is a Hangul syllable"
|
|
15287 | +msgstr ""
|
|
15288 | + |
|
15289 | +#: src/code/unicode.lisp
|
|
15290 | +msgid ""
|
|
15291 | +"Decompose any Hangul syllables in STRING to an equivalent sequence of\n"
|
|
15292 | +" conjoining jamo characters."
|
|
15293 | +msgstr ""
|
|
15294 | + |
|
15270 | 15295 | #: src/compiler/macros.lisp
|
15271 | 15296 | msgid ""
|
15272 | 15297 | "Policy Node Condition*\n"
|
... | ... | @@ -832,6 +832,54 @@ |
832 | 832 | |
833 | 833 | |
834 | 834 | |
835 | +(define-test issue.158
|
|
836 | + (:tag :issues)
|
|
837 | + (let* ((name (string #\Hangul_Syllable_Gyek))
|
|
838 | + (path (make-pathname :directory (list :relative name)
|
|
839 | + :name name
|
|
840 | + :type name)))
|
|
841 | + ;; Enable this when we implement normalization for Darwin
|
|
842 | + #+(and nil darwin)
|
|
843 | + (let ((expected '(4352 4456 4543)))
|
|
844 | + ;; Tests that on Darwin the Hangul pathname has been normalized
|
|
845 | + ;; correctly. We fill in the directory, name, and type components
|
|
846 | + ;; with the same thing since it shouldn't really matter.
|
|
847 | + ;;
|
|
848 | + ;; The expected value is the conjoining jamo for the character
|
|
849 | + ;; #\Hangul_Syllable_Gyek.
|
|
850 | + (assert-equal (map 'list #'char-code (second (pathname-directory path)))
|
|
851 | + expected)
|
|
852 | + (assert-equal (map 'list #'char-code (pathname-name path))
|
|
853 | + expected)
|
|
854 | + (assert-equal (map 'list #'char-code (pathname-type path))
|
|
855 | + expected))
|
|
856 | + #-darwin
|
|
857 | + (let ((expected (list (char-code #\Hangul_Syllable_Gyek))))
|
|
858 | + ;; For other OSes, just assume that the pathname is unchanged.
|
|
859 | + (assert-equal (map 'list #'char-code (second (pathname-directory path)))
|
|
860 | + expected)
|
|
861 | + (assert-equal (map 'list #'char-code (pathname-name path))
|
|
862 | + expected)
|
|
863 | + (assert-equal (map 'list #'char-code (pathname-type path))
|
|
864 | + expected))))
|
|
865 | + |
|
866 | +(define-test issue.158.dir
|
|
867 | + (:tag :issues)
|
|
868 | + (flet ((get-file ()
|
|
869 | + ;; This assumes that there is only one file in resources/darwin
|
|
870 | + (let ((files (directory (merge-pathnames "resources/darwin/*.txt" *test-path*))))
|
|
871 | + (assert-equal (length files) 1)
|
|
872 | + (first files))))
|
|
873 | + (let ((f (get-file))
|
|
874 | + (expected-name "안녕하십니까"))
|
|
875 | + #+darwin
|
|
876 | + (assert-equal (pathname-name f)
|
|
877 | + (unicode::decompose-hangul expected-name))
|
|
878 | + #-darwin
|
|
879 | + (assert-equal (pathname-name f) expected-name))))
|
|
880 | +
|
|
881 | + |
|
882 | + |
|
835 | 883 | (define-test issue.166
|
836 | 884 | (:tag :issues)
|
837 | 885 | ;; While this tests for the correct return value, the problem was
|
... | ... | @@ -896,4 +944,3 @@ |
896 | 944 | (assert-true (typep idf-max-expo 'kernel:double-float-int-exponent))
|
897 | 945 | (assert-true (typep (1- idf-max-expo) 'kernel:double-float-int-exponent))
|
898 | 946 | (assert-false (typep (1+ idf-max-expo) 'kernel:double-float-int-exponent)))) |
899 | - |
1 | +The file name of this file is "안녕하십니까.txt" ("Hello" in Korean.)
|
|
2 | + |
|
3 | + |