| ... |
... |
@@ -315,178 +315,155 @@ |
|
315
|
315
|
character extends to the end of S. If the index is negative or
|
|
316
|
316
|
valid index into S, the returned value will be strictly greater than
|
|
317
|
317
|
the index."
|
|
|
318
|
+ ;; Implements the word-boundary rules of UAX #29 (Unicode 17.0).
|
|
|
319
|
+ ;;
|
|
|
320
|
+ ;; Decode S into codepoints (collapsing UTF-16 surrogate pairs),
|
|
|
321
|
+ ;; classify each by its word-break property, then scan boundaries
|
|
|
322
|
+ ;; left to right applying the WB rules, returning the first boundary
|
|
|
323
|
+ ;; whose string index is strictly greater than I.
|
|
|
324
|
+ ;;
|
|
|
325
|
+ ;; WB4 makes Extend, Format and ZWJ "ignorable": they attach to the
|
|
|
326
|
+ ;; preceding context, so most rules compare a candidate against the
|
|
|
327
|
+ ;; nearest non-ignorable element on each side rather than the literal
|
|
|
328
|
+ ;; neighbour. ZWJ is also significant for WB3c, and Extend can sit
|
|
|
329
|
+ ;; between paired Regional_Indicators, so those rules look past
|
|
|
330
|
+ ;; ignorables explicitly.
|
|
|
331
|
+ (declare (type simple-string s))
|
|
318
|
332
|
(let ((n (length s)))
|
|
319
|
|
- (labels
|
|
320
|
|
- ((char-word-break-category (c)
|
|
321
|
|
- ;; Map our unicode word break property into what this
|
|
322
|
|
- ;; algorithm wants.
|
|
323
|
|
- (let ((cat (unicode-word-break c)))
|
|
324
|
|
- (case cat
|
|
325
|
|
- ((:lf :cr :newline)
|
|
326
|
|
- :sep)
|
|
327
|
|
- ((:extend :format)
|
|
328
|
|
- :extend-or-format)
|
|
329
|
|
- (otherwise cat))))
|
|
330
|
|
- (left-context (i)
|
|
331
|
|
- ;; Given a valid index i into s, returns the left context
|
|
332
|
|
- ;; at i.
|
|
333
|
|
- (multiple-value-bind (c widep)
|
|
334
|
|
- (codepoint s i n)
|
|
335
|
|
- (let* ((back
|
|
336
|
|
- ;; If we're at a regular character or a leading
|
|
337
|
|
- ;; surrogate, decrementing by 1 gets us the to
|
|
338
|
|
- ;; previous character. But for a trailing
|
|
339
|
|
- ;; surrogate, we need to decrement by 2!
|
|
340
|
|
- (if (eql widep -1)
|
|
341
|
|
- 2
|
|
342
|
|
- 1))
|
|
343
|
|
- (cat (char-word-break-category c)))
|
|
344
|
|
- (case cat
|
|
345
|
|
- ((:sep)
|
|
346
|
|
- (if (= c (char-code #\return)) :cr cat))
|
|
347
|
|
- ((:midletter :midnumlet)
|
|
348
|
|
- (let ((i-1 (- i back)))
|
|
349
|
|
- (if (and (<= 0 i-1)
|
|
350
|
|
- (eq (left-context i-1) :aletter))
|
|
351
|
|
- :aletter-midletter
|
|
352
|
|
- cat)))
|
|
353
|
|
- ((:midnum :midnumlet)
|
|
354
|
|
- (let ((i-1 (- i back)))
|
|
355
|
|
- (if (and (<= 0 i-1)
|
|
356
|
|
- (eq (left-context i-1) :numeric))
|
|
357
|
|
- :numeric-midnum
|
|
358
|
|
- cat)))
|
|
359
|
|
- ((:extendorformat)
|
|
360
|
|
- (if (< 0 i)
|
|
361
|
|
- (left-context (- i back))
|
|
362
|
|
- :other))
|
|
363
|
|
- (otherwise cat)))))
|
|
364
|
|
-
|
|
365
|
|
- (index-of-previous-non-ignored (j)
|
|
366
|
|
- ;; Returns the index of the last non-Extend, non-Format
|
|
367
|
|
- ;; character within (substring s 0 j). Should not be
|
|
368
|
|
- ;; called unless such a character exists.
|
|
369
|
|
-
|
|
370
|
|
- (let* ((j1 (- j 1)))
|
|
371
|
|
- (multiple-value-bind (c widep)
|
|
372
|
|
- (codepoint s j1)
|
|
373
|
|
- (when (eql widep -1)
|
|
374
|
|
- ;; Back up one more if we're at the trailing
|
|
375
|
|
- ;; surrogate.
|
|
376
|
|
- (decf j1))
|
|
377
|
|
- (let ((cat (char-word-break-category c)))
|
|
378
|
|
- (case cat
|
|
379
|
|
- ((:extend-or-format)
|
|
380
|
|
- (index-of-previous-non-ignored j1))
|
|
381
|
|
- (otherwise j1))))))
|
|
382
|
|
-
|
|
383
|
|
- (lookup (j context)
|
|
384
|
|
- ;; Given j and the context to the left of (not including) j,
|
|
385
|
|
- ;; returns the index at the start of the next word
|
|
386
|
|
- ;; (or before which a word break is permitted).
|
|
387
|
|
-
|
|
388
|
|
- (if (>= j n)
|
|
389
|
|
- (case context
|
|
390
|
|
- ((:aletter-midletter :numeric-midnum)
|
|
391
|
|
- (let ((j (index-of-previous-non-ignored n)))
|
|
392
|
|
- (if (< i j) j n)))
|
|
393
|
|
- (otherwise n))
|
|
394
|
|
- (multiple-value-bind (c widep)
|
|
395
|
|
- (codepoint s j)
|
|
396
|
|
- (let* ((next-j
|
|
397
|
|
- ;; The next character is either 1 or 2 code
|
|
398
|
|
- ;; units away. For a leading surrogate, it's
|
|
399
|
|
- ;; 2; Otherwise just 1.
|
|
400
|
|
- (if (eql widep 1)
|
|
401
|
|
- 2
|
|
402
|
|
- 1))
|
|
403
|
|
- (cat (char-word-break-category c)))
|
|
404
|
|
- (case cat
|
|
405
|
|
- ((:extend-or-format)
|
|
406
|
|
- (case context
|
|
407
|
|
- ((:cr :sep) j)
|
|
408
|
|
- (otherwise (lookup (+ j next-j) context))))
|
|
409
|
|
- (otherwise
|
|
410
|
|
- (case context
|
|
411
|
|
- ((:cr)
|
|
412
|
|
- (if (= c (char-code #\linefeed))
|
|
413
|
|
- ;; Rule WB3: Don't break CRLF, continue looking
|
|
414
|
|
- (lookup (+ j next-j) cat)
|
|
415
|
|
- j))
|
|
416
|
|
- ((:aletter)
|
|
417
|
|
- (case cat
|
|
418
|
|
- ((:aletter :numeric :extendnumlet)
|
|
419
|
|
- ;; Rules WB5, WB9, ?
|
|
420
|
|
- (lookup (+ j next-j) cat))
|
|
421
|
|
- ((:midletter :midnumlet)
|
|
422
|
|
- ;; Rule WB6, need to keep looking
|
|
423
|
|
- (lookup (+ j next-j) :aletter-midletter))
|
|
424
|
|
- (otherwise j)))
|
|
425
|
|
- ((:aletter-midletter)
|
|
426
|
|
- (case cat
|
|
427
|
|
- ((:aletter)
|
|
428
|
|
- ;; Rule WB7
|
|
429
|
|
- (lookup (+ j next-j) cat))
|
|
430
|
|
- (otherwise
|
|
431
|
|
- ;; Rule WB6 and WB7 were extended, but the
|
|
432
|
|
- ;; region didn't end with :aletter. So
|
|
433
|
|
- ;; backup and break at that point.
|
|
434
|
|
- (let ((j2 (index-of-previous-non-ignored j)))
|
|
435
|
|
- (if (< i j2) j2 j)))))
|
|
436
|
|
- ((:numeric)
|
|
437
|
|
- (case cat
|
|
438
|
|
- ((:numeric :aletter :extendnumlet)
|
|
439
|
|
- ;; Rules WB8, WB10, ?
|
|
440
|
|
- (lookup (+ j next-j) cat))
|
|
441
|
|
- ((:midnum :midnumlet)
|
|
442
|
|
- ;; Rules WB11, need to keep looking
|
|
443
|
|
- (lookup (+ j next-j) :numeric-midnum))
|
|
444
|
|
- (otherwise j)))
|
|
445
|
|
- ((:numeric-midnum)
|
|
446
|
|
- (case cat
|
|
447
|
|
- ((:numeric)
|
|
448
|
|
- ;; Rule WB11, keep looking
|
|
449
|
|
- (lookup (+ j next-j) cat))
|
|
450
|
|
- (otherwise
|
|
451
|
|
- ;; Rule WB11, WB12 were extended, but the
|
|
452
|
|
- ;; region didn't end with :numeric, so
|
|
453
|
|
- ;; backup and break at that point.
|
|
454
|
|
- (let ((j2 (index-of-previous-non-ignored j)))
|
|
455
|
|
- (if (< i j2) j2 j)))))
|
|
456
|
|
- ((:midletter :midnum :midnumlet)
|
|
457
|
|
- ;; Rule WB14
|
|
458
|
|
- j)
|
|
459
|
|
- ((:katakana)
|
|
460
|
|
- (case cat
|
|
461
|
|
- ((:katakana :extendnumlet)
|
|
462
|
|
- ;; Rule WB13, WB13a
|
|
463
|
|
- (lookup (+ j next-j) cat))
|
|
464
|
|
- (otherwise j)))
|
|
465
|
|
- ((:extendnumlet)
|
|
466
|
|
- (case cat
|
|
467
|
|
- ((:extendnumlet :aletter :numeric :katakana)
|
|
468
|
|
- ;; Rule WB13a, WB13b
|
|
469
|
|
- (lookup (+ j next-j) cat))
|
|
470
|
|
- (otherwise j)))
|
|
471
|
|
- ((:regional_indicator)
|
|
472
|
|
- (case cat
|
|
473
|
|
- ((:regional_indicator)
|
|
474
|
|
- ;; Rule WB13c
|
|
475
|
|
- (lookup (+ j next-j) cat))
|
|
476
|
|
- (otherwise j)))
|
|
477
|
|
- (otherwise j)))))))))
|
|
478
|
|
- (declare (notinline lookup left-context))
|
|
479
|
|
- (cond ((< i 0)
|
|
480
|
|
- ;; Rule WB1
|
|
481
|
|
- 0)
|
|
482
|
|
- ((<= n i)
|
|
483
|
|
- ;; Rule WB2
|
|
484
|
|
- n)
|
|
485
|
|
- (t
|
|
486
|
|
- (multiple-value-bind (c widep)
|
|
487
|
|
- (codepoint s i)
|
|
488
|
|
- (declare (ignore c))
|
|
489
|
|
- (lookup (+ i (if (eql widep 1) 2 1)) (left-context i))))))))
|
|
|
333
|
+ (cond
|
|
|
334
|
+ ((< i 0) 0) ; Rule WB1 (start of text)
|
|
|
335
|
+ ((>= i n) n) ; Rule WB2 (end of text)
|
|
|
336
|
+ (t
|
|
|
337
|
+ (let ((cls (make-array 16 :fill-pointer 0 :adjustable t))
|
|
|
338
|
+ (idx (make-array 16 :fill-pointer 0 :adjustable t)))
|
|
|
339
|
+ ;; Decode codepoints and record the string index of each.
|
|
|
340
|
+ (let ((k 0))
|
|
|
341
|
+ (loop while (< k n) do
|
|
|
342
|
+ (multiple-value-bind (cp widep) (codepoint s k)
|
|
|
343
|
+ (vector-push-extend (unicode-word-break cp) cls)
|
|
|
344
|
+ (vector-push-extend k idx)
|
|
|
345
|
+ (incf k (if (eql widep 1) 2 1)))))
|
|
|
346
|
+ (let ((m (fill-pointer cls)))
|
|
|
347
|
+ (labels
|
|
|
348
|
+ ((class (j) (aref cls j))
|
|
|
349
|
+ (ah-letter-p (c) (or (eq c :aletter) (eq c :hebrew_letter)))
|
|
|
350
|
+ (mid-letter-q-p (c) ; (MidLetter | MidNumLetQ)
|
|
|
351
|
+ (or (eq c :midletter) (eq c :midnumlet) (eq c :single_quote)))
|
|
|
352
|
+ (mid-num-q-p (c) ; (MidNum | MidNumLetQ)
|
|
|
353
|
+ (or (eq c :midnum) (eq c :midnumlet) (eq c :single_quote)))
|
|
|
354
|
+ (ignorable (c) ; WB4: Extend | Format | ZWJ
|
|
|
355
|
+ (or (eq c :extend) (eq c :format) (eq c :zwj)))
|
|
|
356
|
+ (ext-pict-at (j)
|
|
|
357
|
+ (unicode-extended-pictographic-p (codepoint s (aref idx j))))
|
|
|
358
|
+ (prev-significant (j) ; last non-ignorable element < J, or -1
|
|
|
359
|
+ (loop for p from (1- j) downto 0
|
|
|
360
|
+ unless (ignorable (class p)) return p
|
|
|
361
|
+ finally (return -1)))
|
|
|
362
|
+ (next-significant (j) ; first non-ignorable element > J, or -1
|
|
|
363
|
+ (loop for q from (1+ j) below m
|
|
|
364
|
+ unless (ignorable (class q)) return q
|
|
|
365
|
+ finally (return -1)))
|
|
|
366
|
+ (ri-count-left (j)
|
|
|
367
|
+ ;; Number of significant Regional_Indicator elements
|
|
|
368
|
+ ;; immediately to the left of J (stopping at the first
|
|
|
369
|
+ ;; non-RI significant element; ignorables are skipped).
|
|
|
370
|
+ (let ((count 0))
|
|
|
371
|
+ (loop for p = (prev-significant j) then (prev-significant p)
|
|
|
372
|
+ while (and (>= p 0) (eq (class p) :regional_indicator))
|
|
|
373
|
+ do (incf count))
|
|
|
374
|
+ count))
|
|
|
375
|
+ (break-before-p (j)
|
|
|
376
|
+ ;; Does the algorithm allow a break immediately before
|
|
|
377
|
+ ;; element J (1 <= J < M)?
|
|
|
378
|
+ (let* ((c (class j))
|
|
|
379
|
+ (lit (class (1- j))) ; literal previous element
|
|
|
380
|
+ (pj (prev-significant j)))
|
|
|
381
|
+ (cond
|
|
|
382
|
+ ;; WB3: CR x LF
|
|
|
383
|
+ ((and (eq lit :cr) (eq c :lf)) nil)
|
|
|
384
|
+ ;; WB3a: (Newline | CR | LF) div
|
|
|
385
|
+ ((member lit '(:newline :cr :lf)) t)
|
|
|
386
|
+ ;; WB3b: div (Newline | CR | LF)
|
|
|
387
|
+ ((member c '(:newline :cr :lf)) t)
|
|
|
388
|
+ ;; WB3c: ZWJ x \p{Extended_Pictographic}
|
|
|
389
|
+ ((and (eq lit :zwj) (ext-pict-at j)) nil)
|
|
|
390
|
+ ;; WB3d: WSegSpace x WSegSpace
|
|
|
391
|
+ ((and (eq lit :wsegspace) (eq c :wsegspace)) nil)
|
|
|
392
|
+ ;; WB4: x (Extend | Format | ZWJ): never break before
|
|
|
393
|
+ ;; an ignorable (covers ignorables after sot or after
|
|
|
394
|
+ ;; another ignorable too).
|
|
|
395
|
+ ((ignorable c) nil)
|
|
|
396
|
+ ;; Nothing significant to the left: break.
|
|
|
397
|
+ ((< pj 0) t)
|
|
|
398
|
+ (t
|
|
|
399
|
+ (let ((p (class pj)))
|
|
|
400
|
+ (cond
|
|
|
401
|
+ ;; WB5: AHLetter x AHLetter
|
|
|
402
|
+ ((and (ah-letter-p p) (ah-letter-p c)) nil)
|
|
|
403
|
+ ;; WB6: AHLetter x (MidLetter|MidNumLetQ) AHLetter
|
|
|
404
|
+ ((and (ah-letter-p p) (mid-letter-q-p c)
|
|
|
405
|
+ (let ((nx (next-significant j)))
|
|
|
406
|
+ (and (>= nx 0) (ah-letter-p (class nx)))))
|
|
|
407
|
+ nil)
|
|
|
408
|
+ ;; WB7: AHLetter (MidLetter|MidNumLetQ) x AHLetter
|
|
|
409
|
+ ((and (ah-letter-p c) (mid-letter-q-p p)
|
|
|
410
|
+ (let ((pp (prev-significant pj)))
|
|
|
411
|
+ (and (>= pp 0) (ah-letter-p (class pp)))))
|
|
|
412
|
+ nil)
|
|
|
413
|
+ ;; WB7a: Hebrew_Letter x Single_Quote
|
|
|
414
|
+ ((and (eq p :hebrew_letter) (eq c :single_quote)) nil)
|
|
|
415
|
+ ;; WB7b: Hebrew_Letter x Double_Quote Hebrew_Letter
|
|
|
416
|
+ ((and (eq p :hebrew_letter) (eq c :double_quote)
|
|
|
417
|
+ (let ((nx (next-significant j)))
|
|
|
418
|
+ (and (>= nx 0) (eq (class nx) :hebrew_letter))))
|
|
|
419
|
+ nil)
|
|
|
420
|
+ ;; WB7c: Hebrew_Letter Double_Quote x Hebrew_Letter
|
|
|
421
|
+ ((and (eq c :hebrew_letter) (eq p :double_quote)
|
|
|
422
|
+ (let ((pp (prev-significant pj)))
|
|
|
423
|
+ (and (>= pp 0) (eq (class pp) :hebrew_letter))))
|
|
|
424
|
+ nil)
|
|
|
425
|
+ ;; WB8: Numeric x Numeric
|
|
|
426
|
+ ((and (eq p :numeric) (eq c :numeric)) nil)
|
|
|
427
|
+ ;; WB9: AHLetter x Numeric
|
|
|
428
|
+ ((and (ah-letter-p p) (eq c :numeric)) nil)
|
|
|
429
|
+ ;; WB10: Numeric x AHLetter
|
|
|
430
|
+ ((and (eq p :numeric) (ah-letter-p c)) nil)
|
|
|
431
|
+ ;; WB11: Numeric (MidNum|MidNumLetQ) x Numeric
|
|
|
432
|
+ ((and (eq c :numeric) (mid-num-q-p p)
|
|
|
433
|
+ (let ((pp (prev-significant pj)))
|
|
|
434
|
+ (and (>= pp 0) (eq (class pp) :numeric))))
|
|
|
435
|
+ nil)
|
|
|
436
|
+ ;; WB12: Numeric x (MidNum|MidNumLetQ) Numeric
|
|
|
437
|
+ ((and (eq p :numeric) (mid-num-q-p c)
|
|
|
438
|
+ (let ((nx (next-significant j)))
|
|
|
439
|
+ (and (>= nx 0) (eq (class nx) :numeric))))
|
|
|
440
|
+ nil)
|
|
|
441
|
+ ;; WB13: Katakana x Katakana
|
|
|
442
|
+ ((and (eq p :katakana) (eq c :katakana)) nil)
|
|
|
443
|
+ ;; WB13a: (AHLetter|Numeric|Katakana|ExtendNumLet) x ExtendNumLet
|
|
|
444
|
+ ((and (member p '(:aletter :hebrew_letter :numeric
|
|
|
445
|
+ :katakana :extendnumlet))
|
|
|
446
|
+ (eq c :extendnumlet))
|
|
|
447
|
+ nil)
|
|
|
448
|
+ ;; WB13b: ExtendNumLet x (AHLetter|Numeric|Katakana)
|
|
|
449
|
+ ((and (eq p :extendnumlet)
|
|
|
450
|
+ (or (ah-letter-p c)
|
|
|
451
|
+ (member c '(:numeric :katakana))))
|
|
|
452
|
+ nil)
|
|
|
453
|
+ ;; WB15/WB16: in a Regional_Indicator run, break
|
|
|
454
|
+ ;; only between pairs: break before J iff an even
|
|
|
455
|
+ ;; number of RIs precede it.
|
|
|
456
|
+ ((and (eq p :regional_indicator)
|
|
|
457
|
+ (eq c :regional_indicator))
|
|
|
458
|
+ (evenp (ri-count-left j)))
|
|
|
459
|
+ ;; WB999: otherwise break.
|
|
|
460
|
+ (t t))))))))
|
|
|
461
|
+ ;; Find the first allowed boundary whose string index is > I.
|
|
|
462
|
+ (loop for j from 1 below m
|
|
|
463
|
+ when (and (> (aref idx j) i) (break-before-p j))
|
|
|
464
|
+ do (return-from string-next-word-break (aref idx j)))
|
|
|
465
|
+ ;; WB2: otherwise the word extends to end of text.
|
|
|
466
|
+ n)))))))
|
|
490
|
467
|
|
|
491
|
468
|
(defun char-titlecase (char)
|
|
492
|
469
|
"Returns CHAR converted to title-case if that is possible."
|