| ... |
... |
@@ -55,6 +55,15 @@ |
|
55
|
55
|
;; of collation-elements. Single characters and contractions share
|
|
56
|
56
|
;; this table; the key length distinguishes them.
|
|
57
|
57
|
(map (make-hash-table :test 'equalp) :type hash-table)
|
|
|
58
|
+ ;; Fast paths derived from MAP at load time. SINGLE maps a single
|
|
|
59
|
+ ;; codepoint (a fixnum) directly to its collation elements, avoiding an
|
|
|
60
|
+ ;; EQUALP probe with a one-element key for the common case. STARTERS
|
|
|
61
|
+ ;; holds every codepoint that begins some multi-codepoint key; a
|
|
|
62
|
+ ;; codepoint not in STARTERS can neither begin a contraction nor be the
|
|
|
63
|
+ ;; base of a discontiguous match, so the contraction machinery is
|
|
|
64
|
+ ;; skipped for it entirely.
|
|
|
65
|
+ (single (make-hash-table :test 'eql) :type hash-table)
|
|
|
66
|
+ (starters (make-hash-table :test 'eql) :type hash-table)
|
|
58
|
67
|
;; Implicit-weight ranges, in file order.
|
|
59
|
68
|
(implicit-ranges nil :type list)
|
|
60
|
69
|
;; Longest key (in codepoints) present in MAP; bounds the contraction
|
| ... |
... |
@@ -180,6 +189,13 @@ returning an IMPLICIT-RANGE, or NIL if it does not parse." |
|
180
|
189
|
do (setf (ducet-max-variable-primary d)
|
|
181
|
190
|
(ce-primary ce))))))))))))
|
|
182
|
191
|
(setf (ducet-implicit-ranges d) (nreverse (ducet-implicit-ranges d)))
|
|
|
192
|
+ ;; Build the fast-path tables: SINGLE for one-codepoint keys, and
|
|
|
193
|
+ ;; STARTERS for the first codepoint of every multi-codepoint key.
|
|
|
194
|
+ (maphash (lambda (key ces)
|
|
|
195
|
+ (if (= (length key) 1)
|
|
|
196
|
+ (setf (gethash (aref key 0) (ducet-single d)) ces)
|
|
|
197
|
+ (setf (gethash (aref key 0) (ducet-starters d)) t)))
|
|
|
198
|
+ (ducet-map d))
|
|
183
|
199
|
;; Compute, for each base, the smallest range start sharing it, and
|
|
184
|
200
|
;; record it as the BBBB offset origin on every range with that base.
|
|
185
|
201
|
(let ((origin (make-hash-table)))
|
| ... |
... |
@@ -298,74 +314,106 @@ the blocking threshold for later non-starters." |
|
298
|
314
|
(consumed (make-array (max 1 (length cps)) :initial-element nil))
|
|
299
|
315
|
(i 0)
|
|
300
|
316
|
(n (length cps))
|
|
301
|
|
- (maxlen (ducet-max-key-length d)))
|
|
|
317
|
+ (maxlen (ducet-max-key-length d))
|
|
|
318
|
+ (map (ducet-map d))
|
|
|
319
|
+ (single (ducet-single d))
|
|
|
320
|
+ (starters (ducet-starters d)))
|
|
302
|
321
|
(loop while (< i n) do
|
|
303
|
322
|
(cond
|
|
304
|
323
|
((aref consumed i) (incf i)) ; folded in by a discontiguous match
|
|
305
|
324
|
(t
|
|
306
|
|
- (let ((best-key nil) (best-len 0) (best-ces nil))
|
|
307
|
|
- ;; S2.1: longest contiguous match starting at I, over positions
|
|
308
|
|
- ;; not already consumed by an earlier discontiguous fold. A
|
|
309
|
|
- ;; multi-codepoint key may only match an unbroken run of
|
|
310
|
|
- ;; still-present codepoints, so a span containing a consumed
|
|
311
|
|
- ;; position (a "hole" left by a fold) is rejected.
|
|
312
|
|
- (loop for len from (min maxlen (- n i)) downto 1 do
|
|
313
|
|
- (when (loop for j from i below (+ i len)
|
|
314
|
|
- never (aref consumed j))
|
|
315
|
|
- (let* ((key (subseq cps i (+ i len)))
|
|
316
|
|
- (ces (gethash key (ducet-map d))))
|
|
317
|
|
- (when ces
|
|
318
|
|
- (setf best-key key best-len len best-ces ces)
|
|
319
|
|
- (return)))))
|
|
|
325
|
+ (let* ((cp (aref cps i))
|
|
|
326
|
+ (starter (gethash cp starters))
|
|
|
327
|
+ (best-key nil) (best-len 0) (best-ces nil))
|
|
|
328
|
+ ;; S2.1: longest contiguous match starting at I. Only a
|
|
|
329
|
+ ;; codepoint that begins some multi-codepoint key (a STARTER)
|
|
|
330
|
+ ;; can match a contraction, so the multi-length scan is done
|
|
|
331
|
+ ;; only for those; every other codepoint takes the SINGLE
|
|
|
332
|
+ ;; fast path. The scan rejects any span containing a position
|
|
|
333
|
+ ;; already consumed by an earlier discontiguous fold.
|
|
|
334
|
+ (when starter
|
|
|
335
|
+ (loop for len from (min maxlen (- n i)) downto 2 do
|
|
|
336
|
+ (when (loop for j from i below (+ i len)
|
|
|
337
|
+ never (aref consumed j))
|
|
|
338
|
+ (let* ((key (subseq cps i (+ i len)))
|
|
|
339
|
+ (ces (gethash key map)))
|
|
|
340
|
+ (when ces
|
|
|
341
|
+ (setf best-key key best-len len best-ces ces)
|
|
|
342
|
+ (return))))))
|
|
|
343
|
+ (unless best-ces
|
|
|
344
|
+ (let ((ces (gethash cp single)))
|
|
|
345
|
+ (when ces
|
|
|
346
|
+ (setf best-ces ces best-len 1)
|
|
|
347
|
+ ;; BEST-KEY is needed only to extend by discontiguous
|
|
|
348
|
+ ;; folds, which apply only to starters.
|
|
|
349
|
+ (when starter
|
|
|
350
|
+ (setf best-key (make-array 1 :initial-element cp))))))
|
|
320
|
351
|
(cond
|
|
321
|
|
- (best-key
|
|
|
352
|
+ (best-ces
|
|
322
|
353
|
;; S2.1.1-S2.1.3: extend by unblocked following non-starters.
|
|
|
354
|
+ ;; A codepoint that is not a STARTER can begin no multi-key,
|
|
|
355
|
+ ;; so no fold is possible and the scan is skipped entirely.
|
|
323
|
356
|
;; MAXCCC is the highest combining class among still-present
|
|
324
|
357
|
;; codepoints passed since the end of the contiguous match.
|
|
325
|
|
- (let ((maxccc 0))
|
|
326
|
|
- (loop for k from (+ i best-len) below n do
|
|
327
|
|
- (unless (aref consumed k)
|
|
328
|
|
- (let ((ccc (lisp::unicode-combining-class (aref cps k))))
|
|
329
|
|
- (cond
|
|
330
|
|
- ((zerop ccc)
|
|
331
|
|
- (return)) ; starter blocks all further
|
|
332
|
|
- ((> ccc maxccc) ; unblocked candidate
|
|
333
|
|
- (let* ((cand (collation-extend-key
|
|
334
|
|
- best-key (aref cps k)))
|
|
335
|
|
- (ces (gethash cand (ducet-map d))))
|
|
336
|
|
- (if ces
|
|
337
|
|
- ;; Fold C in and remove it; MAXCCC unchanged
|
|
338
|
|
- ;; since C is no longer present.
|
|
339
|
|
- (setf best-key cand
|
|
340
|
|
- best-ces ces
|
|
341
|
|
- (aref consumed k) t)
|
|
342
|
|
- ;; No match: C stays and becomes a blocker.
|
|
343
|
|
- (setf maxccc ccc))))
|
|
344
|
|
- (t
|
|
345
|
|
- ;; Blocked (ccc <= maxccc): C stays; MAXCCC already
|
|
346
|
|
- ;; covers it.
|
|
347
|
|
- nil))))))
|
|
|
358
|
+ (when (and starter best-key)
|
|
|
359
|
+ (let ((maxccc 0))
|
|
|
360
|
+ (loop for k from (+ i best-len) below n do
|
|
|
361
|
+ (unless (aref consumed k)
|
|
|
362
|
+ (let ((ccc (lisp::unicode-combining-class (aref cps k))))
|
|
|
363
|
+ (cond
|
|
|
364
|
+ ((zerop ccc)
|
|
|
365
|
+ (return)) ; starter blocks all further
|
|
|
366
|
+ ((> ccc maxccc) ; unblocked candidate
|
|
|
367
|
+ (let* ((cand (collation-extend-key
|
|
|
368
|
+ best-key (aref cps k)))
|
|
|
369
|
+ (ces (gethash cand map)))
|
|
|
370
|
+ (if ces
|
|
|
371
|
+ ;; Fold C in and remove it; MAXCCC
|
|
|
372
|
+ ;; unchanged since C is no longer present.
|
|
|
373
|
+ (setf best-key cand
|
|
|
374
|
+ best-ces ces
|
|
|
375
|
+ (aref consumed k) t)
|
|
|
376
|
+ ;; No match: C stays and becomes a blocker.
|
|
|
377
|
+ (setf maxccc ccc))))
|
|
|
378
|
+ (t
|
|
|
379
|
+ ;; Blocked (ccc <= maxccc): C stays; MAXCCC
|
|
|
380
|
+ ;; already covers it.
|
|
|
381
|
+ nil)))))))
|
|
348
|
382
|
(loop for ce across best-ces do (push ce result))
|
|
349
|
383
|
(incf i best-len))
|
|
350
|
384
|
(t
|
|
351
|
385
|
;; No entry: derive implicit weights (Section 10.1).
|
|
352
|
|
- (dolist (ce (derive-implicit-elements d (aref cps i)))
|
|
|
386
|
+ (dolist (ce (derive-implicit-elements d cp))
|
|
353
|
387
|
(push ce result))
|
|
354
|
388
|
(incf i)))))))
|
|
355
|
389
|
(nreverse result)))
|
|
356
|
390
|
|
|
357
|
|
-(defun collation-weights (d string)
|
|
|
391
|
+(defun collation-weights (d string &optional (variable-weighting :shifted))
|
|
358
|
392
|
"Return four values -- the level-1, level-2, level-3 and level-4 weight
|
|
359
|
|
-lists for STRING under DUCET D, using the Shifted variable-weighting
|
|
360
|
|
-option (UTS #10 Section 4). Under Shifted: a variable element con-
|
|
361
|
|
-tributes nothing at levels 1-3 and its primary at level 4; a non-
|
|
362
|
|
-variable element takes a level-4 weight of FFFF when it carries level-2
|
|
363
|
|
-or level-3 content, but none when it is a primary-only continuation
|
|
364
|
|
-element ([.XXXX.0000.0000], e.g. the second half of an implicit weight
|
|
365
|
|
-pair or an expansion tail); a completely ignorable element
|
|
366
|
|
-([.0000.0000.0000]) contributes nothing anywhere; and a primary-
|
|
367
|
|
-ignorable element that follows a variable element is shifted away
|
|
368
|
|
-entirely."
|
|
|
393
|
+lists for STRING under DUCET D. VARIABLE-WEIGHTING selects the UTS #10
|
|
|
394
|
+Section 4 option:
|
|
|
395
|
+
|
|
|
396
|
+ :SHIFTED (the default) -- a variable element contributes nothing at
|
|
|
397
|
+ levels 1-3 and its primary at level 4; a non-variable element takes
|
|
|
398
|
+ a level-4 weight of FFFF when it carries level-2 or level-3 content,
|
|
|
399
|
+ but none when it is a primary-only continuation element
|
|
|
400
|
+ ([.XXXX.0000.0000], e.g. the second half of an implicit weight pair
|
|
|
401
|
+ or an expansion tail); a completely ignorable element
|
|
|
402
|
+ ([.0000.0000.0000]) contributes nothing anywhere; and a primary-
|
|
|
403
|
+ ignorable element that follows a variable element is shifted away
|
|
|
404
|
+ entirely.
|
|
|
405
|
+
|
|
|
406
|
+ :NON-IGNORABLE -- variable elements are not treated specially: every
|
|
|
407
|
+ element contributes its non-zero weights at levels 1-3 just like any
|
|
|
408
|
+ other element, and there is no fourth level (the returned L4 is
|
|
|
409
|
+ always NIL)."
|
|
|
410
|
+ (ecase variable-weighting
|
|
|
411
|
+ (:shifted (collation-weights-shifted d string))
|
|
|
412
|
+ (:non-ignorable (collation-weights-non-ignorable d string))))
|
|
|
413
|
+
|
|
|
414
|
+(defun collation-weights-shifted (d string)
|
|
|
415
|
+ "Compute the four weight levels for STRING under the Shifted option.
|
|
|
416
|
+See COLLATION-WEIGHTS."
|
|
369
|
417
|
(let ((ces (ducet-element-array d (string-to-collation-codepoints string)))
|
|
370
|
418
|
(l1 nil) (l2 nil) (l3 nil) (l4 nil)
|
|
371
|
419
|
(after-variable nil))
|
| ... |
... |
@@ -402,14 +450,35 @@ entirely." |
|
402
|
450
|
(setf after-variable nil)))))
|
|
403
|
451
|
(values (nreverse l1) (nreverse l2) (nreverse l3) (nreverse l4))))
|
|
404
|
452
|
|
|
405
|
|
-(defun collation-sort-key (d string)
|
|
406
|
|
- "Compute the UTS #10 sort key for STRING under DUCET D (Shifted
|
|
407
|
|
-option). Returns a (simple-array (unsigned-byte 16) (*)) holding the
|
|
408
|
|
-level-1 weights, a 0000 separator, the level-2 weights, 0000, the
|
|
409
|
|
-level-3 weights, 0000, and the level-4 weights. Binary comparison of
|
|
410
|
|
-two such keys yields the collation order of their strings."
|
|
411
|
|
- (multiple-value-bind (l1 l2 l3 l4) (collation-weights d string)
|
|
412
|
|
- (let* ((weights (append l1 (list 0) l2 (list 0) l3 (list 0) l4))
|
|
|
453
|
+(defun collation-weights-non-ignorable (d string)
|
|
|
454
|
+ "Compute the weight levels for STRING under the Non-ignorable option.
|
|
|
455
|
+Variable elements are treated exactly like ordinary elements, and there
|
|
|
456
|
+is no fourth level. See COLLATION-WEIGHTS."
|
|
|
457
|
+ (let ((ces (ducet-element-array d (string-to-collation-codepoints string)))
|
|
|
458
|
+ (l1 nil) (l2 nil) (l3 nil))
|
|
|
459
|
+ (dolist (ce ces)
|
|
|
460
|
+ (let ((p (ce-primary ce))
|
|
|
461
|
+ (s (ce-secondary ce))
|
|
|
462
|
+ (te (ce-tertiary ce)))
|
|
|
463
|
+ ;; Every element -- variable or not -- contributes its non-zero
|
|
|
464
|
+ ;; weights at each level; zero weights are passed over.
|
|
|
465
|
+ (unless (zerop p) (push p l1))
|
|
|
466
|
+ (unless (zerop s) (push s l2))
|
|
|
467
|
+ (unless (zerop te) (push te l3))))
|
|
|
468
|
+ (values (nreverse l1) (nreverse l2) (nreverse l3) nil)))
|
|
|
469
|
+
|
|
|
470
|
+(defun collation-sort-key (d string &optional (variable-weighting :shifted))
|
|
|
471
|
+ "Compute the UTS #10 sort key for STRING under DUCET D. Returns a
|
|
|
472
|
+(simple-array (unsigned-byte 16) (*)) holding the level-1 weights, a 0000
|
|
|
473
|
+separator, the level-2 weights, 0000, the level-3 weights, and -- for the
|
|
|
474
|
+:SHIFTED option -- a further 0000 separator and the level-4 weights.
|
|
|
475
|
+Binary comparison of two such keys yields the collation order of their
|
|
|
476
|
+strings. VARIABLE-WEIGHTING is as in COLLATION-WEIGHTS."
|
|
|
477
|
+ (multiple-value-bind (l1 l2 l3 l4)
|
|
|
478
|
+ (collation-weights d string variable-weighting)
|
|
|
479
|
+ (let* ((weights (if (eq variable-weighting :non-ignorable)
|
|
|
480
|
+ (append l1 (list 0) l2 (list 0) l3)
|
|
|
481
|
+ (append l1 (list 0) l2 (list 0) l3 (list 0) l4)))
|
|
413
|
482
|
(key (make-array (length weights)
|
|
414
|
483
|
:element-type '(unsigned-byte 16))))
|
|
415
|
484
|
(loop for w in weights
|
| ... |
... |
@@ -417,12 +486,12 @@ two such keys yields the collation order of their strings." |
|
417
|
486
|
do (setf (aref key k) w))
|
|
418
|
487
|
key)))
|
|
419
|
488
|
|
|
420
|
|
-(defun collation-compare (d s1 s2)
|
|
|
489
|
+(defun collation-compare (d s1 s2 &optional (variable-weighting :shifted))
|
|
421
|
490
|
"Compare strings S1 and S2 under DUCET D. Returns -1, 0, or 1 like a
|
|
422
|
491
|
three-way comparison: negative if S1 sorts before S2, zero if equal, 1
|
|
423
|
|
-if after."
|
|
424
|
|
- (let ((k1 (collation-sort-key d s1))
|
|
425
|
|
- (k2 (collation-sort-key d s2)))
|
|
|
492
|
+if after. VARIABLE-WEIGHTING is as in COLLATION-WEIGHTS."
|
|
|
493
|
+ (let ((k1 (collation-sort-key d s1 variable-weighting))
|
|
|
494
|
+ (k2 (collation-sort-key d s2 variable-weighting)))
|
|
426
|
495
|
(let ((n (min (length k1) (length k2))))
|
|
427
|
496
|
(dotimes (i n)
|
|
428
|
497
|
(let ((a (aref k1 i)) (b (aref k2 i)))
|