Raymond Toy pushed to branch master at cmucl / cmucl
Commits:
-
07a0ca81
by Raymond Toy at 2024-03-15T20:53:00+00:00
-
335588ce
by Raymond Toy at 2024-03-15T20:53:04+00:00
7 changed files:
- bin/run-ansi-tests.sh
- src/code/rand-xoroshiro.lisp
- src/compiler/x86/arith.lisp
- src/general-info/release-21f.md
- src/i18n/locale/cmucl.pot
- tests/rng.lisp
- + tests/rng/test-128-star-star.c
Changes:
... | ... | @@ -33,14 +33,15 @@ shift $[$OPTIND - 1] |
33 | 33 | |
34 | 34 | set -x
|
35 | 35 | if [ -d ../ansi-test ]; then
|
36 | - # We already have clone; make sure it's clean by stashing any changes.
|
|
37 | - (cd ../ansi-test; git stash)
|
|
36 | + # We already have clone; make sure it's clean by stashing any
|
|
37 | + # changes. Then pull any updates.
|
|
38 | + (cd ../ansi-test; git stash; git pull --rebase)
|
|
38 | 39 | else
|
39 | 40 | (cd ../; git clone https://gitlab.common-lisp.net/cmucl/ansi-test.git)
|
40 | 41 | fi
|
41 | 42 | |
42 | 43 | cd ../ansi-test
|
43 | -git checkout cmucl-expected-failures
|
|
44 | +git checkout issue-276-xoroshiro
|
|
44 | 45 | |
45 | 46 | make LISP="$LISP batch -noinit -nositeinit"
|
46 | 47 | # There should be no unexpected successes or failures; check these separately
|
... | ... | @@ -10,7 +10,7 @@ |
10 | 10 | ;;;
|
11 | 11 | ;;; **********************************************************************
|
12 | 12 | ;;;
|
13 | -;;; Support for the xoroshiro128+ random number generator by David
|
|
13 | +;;; Support for the xoroshiro128** random number generator by David
|
|
14 | 14 | ;;; Blackman and Sebastiano Vigna (vigna@acm.org). See
|
15 | 15 | ;;; http://xoroshiro.di.unimi.it/.
|
16 | 16 | |
... | ... | @@ -225,127 +225,143 @@ |
225 | 225 | |
226 | 226 | ;;;; Random entries:
|
227 | 227 | |
228 | -;; Sparc and x86 have vops to implement xoroshiro-gen that are much
|
|
229 | -;; faster than the portable lisp version. Use them.
|
|
230 | -#+(or x86 sparc)
|
|
228 | +;; X86 has a vop to implement xoroshiro-gen that is about 4.5 times
|
|
229 | +;; faster than the portable lisp version below. For other
|
|
230 | +;; architectures, we use the portable version until a vop is written.
|
|
231 | +#+x86
|
|
231 | 232 | (declaim (inline xoroshiro-gen))
|
232 | -#+(or x86 sparc)
|
|
233 | +#+x86
|
|
233 | 234 | (defun xoroshiro-gen (state)
|
235 | + _N"Generate the next 64-bit result from the xoroshiro128** generator
|
|
236 | + using the state in STATE, a simple-array of 2 double-floats. The
|
|
237 | + 64-bit result is returned as 2 32-bit values, with the high 32-bits
|
|
238 | + being the first value."
|
|
234 | 239 | (declare (type (simple-array double-float (2)) state)
|
235 | 240 | (optimize (speed 3) (safety 0)))
|
236 | 241 | (vm::xoroshiro-next state))
|
237 | 242 | |
238 | -#-(or x86 sparc)
|
|
243 | +#-x86
|
|
239 | 244 | (defun xoroshiro-gen (state)
|
245 | + _N"Generate the next 64-bit result from the xoroshiro128** generator
|
|
246 | + using the state in STATE, a simple-array of 2 double-floats. The
|
|
247 | + 64-bit result is returned as 2 32-bit values, with the high 32-bits
|
|
248 | + being the first value."
|
|
240 | 249 | (declare (type (simple-array double-float (2)) state)
|
241 | 250 | (optimize (speed 3) (safety 0)))
|
242 | - ;; Portable implementation of the xoroshiro128+ generator. See
|
|
243 | - ;; http://xoroshiro.di.unimi.it/xoroshiro128plus.c for the
|
|
244 | - ;; definitive definition.
|
|
245 | - ;;
|
|
246 | - ;; uint64_t s[2];
|
|
247 | - ;;
|
|
248 | - ;; static inline uint64_t rotl(const uint64_t x, int k) {
|
|
249 | - ;; return (x << k) | (x >> (64 - k));
|
|
250 | - ;; }
|
|
251 | - ;;
|
|
252 | - ;; uint64_t next(void) {
|
|
253 | - ;; const uint64_t s0 = s[0];
|
|
254 | - ;; uint64_t s1 = s[1];
|
|
255 | - ;; const uint64_t result = s0 + s1;
|
|
256 | - ;;
|
|
257 | - ;; s1 ^= s0;
|
|
258 | - ;; s[0] = rotl(s0, 55) ^ s1 ^ (s1 << 14); // a, b
|
|
259 | - ;; s[1] = rotl(s1, 36); // c
|
|
260 | - ;;
|
|
261 | - ;; return result;
|
|
262 | - ;; }
|
|
263 | - ;;
|
|
264 | - (flet ((rotl-55 (x1 x0)
|
|
265 | - ;; Rotate [x1|x0] left 55 bits, returning the result as two
|
|
266 | - ;; values.
|
|
267 | - (declare (type (unsigned-byte 32) x0 x1)
|
|
268 | - (optimize (speed 3) (safety 0)))
|
|
269 | - ;; x << 55
|
|
270 | - (let ((sl55-h (ldb (byte 32 0) (ash x0 (- 55 32))))
|
|
271 | - (sl55-l 0))
|
|
272 | - ;; x >> 9
|
|
273 | - (let ((sr9-h (ash x1 -9))
|
|
274 | - (sr9-l (ldb (byte 32 0)
|
|
275 | - (logior (ash x0 -9)
|
|
276 | - (ash x1 23)))))
|
|
277 | - (values (logior sl55-h sr9-h)
|
|
278 | - (logior sl55-l sr9-l)))))
|
|
279 | - (rotl-36 (x1 x0)
|
|
280 | - ;; Rotate [x1|x0] left 36 bits, returning the result as two
|
|
281 | - ;; values.
|
|
282 | - (declare (type (unsigned-byte 32) x0 x1)
|
|
283 | - (optimize (speed 3) (safety 0)))
|
|
284 | - ;; x << 36
|
|
285 | - (let ((sl36-h (ldb (byte 32 0) (ash x0 4))))
|
|
286 | - ;; x >> 28
|
|
287 | - (let ((sr28-l (ldb (byte 32 0)
|
|
288 | - (logior (ash x0 -28)
|
|
289 | - (ash x1 4))))
|
|
290 | - (sr28-h (ash x1 -28)))
|
|
291 | - (values (logior sl36-h sr28-h)
|
|
292 | - sr28-l))))
|
|
293 | - (shl-14 (x1 x0)
|
|
294 | - ;; Shift [x1|x0] left by 14 bits, returning the result as
|
|
295 | - ;; two values.
|
|
296 | - (declare (type (unsigned-byte 32) x1 x0)
|
|
297 | - (optimize (speed 3) (safety 0)))
|
|
298 | - (values (ldb (byte 32 0)
|
|
299 | - (logior (ash x1 14)
|
|
300 | - (ash x0 (- 14 32))))
|
|
301 | - (ldb (byte 32 0)
|
|
302 | - (ash x0 14))))
|
|
303 | - (make-double (hi lo)
|
|
304 | - (kernel:make-double-float
|
|
305 | - (if (< hi #x80000000)
|
|
306 | - hi
|
|
307 | - (- hi #x100000000))
|
|
308 | - lo)))
|
|
251 | + (flet
|
|
252 | + ((make-double (hi lo)
|
|
253 | + ;; Convert [hi lo] to a double-float, where hi and lo are the
|
|
254 | + ;; high and low 32 bits of a 64-bit double-float.
|
|
255 | + (declare (type (unsigned-byte 32) hi lo))
|
|
256 | + (kernel:make-double-float
|
|
257 | + (if (< hi #x80000000)
|
|
258 | + hi
|
|
259 | + (- hi #x100000000))
|
|
260 | + lo))
|
|
261 | + (mul-shift (x1 x0 shift)
|
|
262 | + ;; Perform multiplication by 2^shift+1, by doing a shift and
|
|
263 | + ;; an add.
|
|
264 | + (declare (type (unsigned-byte 32) x1 x0)
|
|
265 | + (optimize (speed 3)))
|
|
266 | + (let* ((overflow (ash x0 (- (- 32 shift))))
|
|
267 | + (s0 (ldb (byte 32 0) (ash x0 shift)))
|
|
268 | + (s1 (logior (ldb (byte 32 0) (ash x1 shift))
|
|
269 | + overflow)))
|
|
270 | + (multiple-value-bind (sum0 carry)
|
|
271 | + (bignum:%add-with-carry s0 x0 0)
|
|
272 | + (multiple-value-bind (sum1)
|
|
273 | + (bignum:%add-with-carry s1 x1 carry)
|
|
274 | + (values sum1 sum0)))))
|
|
275 | + (shl (x1 x0 n)
|
|
276 | + ;; Shift left [x1 x0] by n bits, where 0 <= n < 31.
|
|
277 | + (declare (type (unsigned-byte 32) x1 x0)
|
|
278 | + (type (integer 0 31) n)
|
|
279 | + (optimize (speed 3)))
|
|
280 | + (let ((out (ldb (byte n (- 32 n)) x0)))
|
|
281 | + (values (logior (ldb (byte 32 0) (ash x1 n)) out)
|
|
282 | + (ldb (byte 32 0) (ash x0 n)))))
|
|
283 | + (rotl (x1 x0 n)
|
|
284 | + ;; Rotate left [x1 x0] by n bits where n < 32.
|
|
285 | + (declare (type (unsigned-byte 32) x1 x0)
|
|
286 | + (type (integer 0 31) n)
|
|
287 | + (optimize (speed 3)))
|
|
288 | + (let ((x1-hi (ldb (byte n (- 32 n)) x1))
|
|
289 | + (x0-hi (ldb (byte n (- 32 n)) x0)))
|
|
290 | + ;; x1-hi and x0-hi are the bits that are shifted out of x1
|
|
291 | + ;; and x0.
|
|
292 | + (values (logior (ldb (byte 32 0) (ash x1 n))
|
|
293 | + x0-hi)
|
|
294 | + (logior (ldb (byte 32 0) (ash x0 n))
|
|
295 | + x1-hi))))
|
|
296 | + (rotr (x1 x0 n)
|
|
297 | + ;; Rotate right [x1 x0] by n bits where n < 32.
|
|
298 | + (declare (type (unsigned-byte 32) x1 x0)
|
|
299 | + (type (integer 0 31) n)
|
|
300 | + (optimize (speed 3)))
|
|
301 | + (let ((x1-lo (ldb (byte n 0) x1))
|
|
302 | + (x0-lo (ldb (byte n 0) x0))
|
|
303 | + (-n (- n))
|
|
304 | + (32-n (- 32 n)))
|
|
305 | + ;; x1-lo and x0-lo are the bits of x1/x0 that would be
|
|
306 | + ;; shifted out.
|
|
307 | + (values (logior (ldb (byte 32 0) (ash x1 -n))
|
|
308 | + (ldb (byte 32 0) (ash x0-lo 32-n)))
|
|
309 | + (logior (ldb (byte 32 0) (ash x0 -n))
|
|
310 | + (ldb (byte 32 0) (ash x1-lo 32-n)))))))
|
|
311 | + (declare (inline mul-shift shl rotl rotr))
|
|
309 | 312 | (let ((s0-1 0)
|
310 | 313 | (s0-0 0)
|
311 | 314 | (s1-1 0)
|
312 | - (s1-0 0))
|
|
313 | - (declare (type (unsigned-byte 32) s0-1 s0-0 s1-1 s1-0))
|
|
314 | - ;; Load the state to s0 and s1. s0-1 is the high 32-bit part and
|
|
315 | - ;; s0-0 is the low 32-bit part of the 64-bit value. Similarly
|
|
316 | - ;; for s1.
|
|
315 | + (s1-0 0)
|
|
316 | + (result-1 0)
|
|
317 | + (result-0 0))
|
|
318 | + (declare (type (unsigned-byte 32) s0-1 s0-0 s1-1 s1-0 result-1 result-0))
|
|
319 | + ;; [s0-1 s0-0] is the first 64-bit word of the state.
|
|
317 | 320 | (multiple-value-bind (x1 x0)
|
318 | 321 | (kernel:double-float-bits (aref state 0))
|
319 | 322 | (setf s0-1 (ldb (byte 32 0) x1)
|
320 | 323 | s0-0 x0))
|
324 | + ;; [s1-1 s1-0] is the second 64-bit word of the state.
|
|
321 | 325 | (multiple-value-bind (x1 x0)
|
322 | 326 | (kernel:double-float-bits (aref state 1))
|
323 | 327 | (setf s1-1 (ldb (byte 32 0) x1)
|
324 | 328 | s1-0 x0))
|
325 | 329 | |
326 | - ;; Compute the 64-bit random value: s0 + s1
|
|
327 | - (multiple-value-prog1
|
|
328 | - (multiple-value-bind (sum-0 c)
|
|
329 | - (bignum::%add-with-carry s0-0 s1-0 0)
|
|
330 | - (values (bignum::%add-with-carry s0-1 s1-1 c)
|
|
331 | - sum-0))
|
|
332 | - ;; s1 ^= s0
|
|
333 | - (setf s1-1 (logxor s1-1 s0-1)
|
|
334 | - s1-0 (logxor s1-0 s0-0))
|
|
335 | - ;; s[0] = rotl(s0,55) ^ s1 ^ (s1 << 14)
|
|
336 | - (multiple-value-setq (s0-1 s0-0)
|
|
337 | - (rotl-55 s0-1 s0-0))
|
|
338 | - (setf s0-1 (logxor s0-1 s1-1)
|
|
339 | - s0-0 (logxor s0-0 s1-0))
|
|
340 | - (multiple-value-bind (s14-1 s14-0)
|
|
341 | - (shl-14 s1-1 s1-0)
|
|
342 | - (setf s0-1 (logxor s0-1 s14-1)
|
|
343 | - s0-0 (logxor s0-0 s14-0)))
|
|
344 | - |
|
345 | - (multiple-value-bind (r1 r0)
|
|
346 | - (rotl-36 s1-1 s1-0)
|
|
347 | - (setf (aref state 0) (make-double s0-1 s0-0)
|
|
348 | - (aref state 1) (make-double r1 r0)))))))
|
|
330 | + ;; [result-1 result-0] = s0 * 5
|
|
331 | + (multiple-value-setq (result-1 result-0)
|
|
332 | + (mul-shift s0-1 s0-0 2))
|
|
333 | + |
|
334 | + ;; [result-1 result-0] = rotl(result, 7)
|
|
335 | + (multiple-value-setq (result-1 result-0)
|
|
336 | + (rotl result-1 result-0 7))
|
|
337 | + |
|
338 | + ;; [result-1 result-0] = result*9
|
|
339 | + (multiple-value-setq (result-1 result-0)
|
|
340 | + (mul-shift result-1 result-0 3))
|
|
341 | + |
|
342 | + ;; s1 ^= s0
|
|
343 | + (setf s1-1 (logxor s1-1 s0-1)
|
|
344 | + s1-0 (logxor s1-0 s0-0))
|
|
345 | + (multiple-value-bind (s24-hi s24-lo)
|
|
346 | + (rotl s0-1 s0-0 24)
|
|
347 | + (declare (type (unsigned-byte 32) s24-hi s24-lo))
|
|
348 | + ;; [s24-hi s24-lo] = rotl(s0, 24) ^ s1
|
|
349 | + (setf s24-hi (logxor s24-hi s1-1)
|
|
350 | + s24-lo (logxor s24-lo s1-0))
|
|
351 | + (multiple-value-bind (s16-hi s16-lo)
|
|
352 | + (shl s1-1 s1-0 16)
|
|
353 | + ;; [s24-hi s24-lo] = s24 ^ (s1 << 16) = state[0]
|
|
354 | + (setf s24-hi (logxor s24-hi s16-hi)
|
|
355 | + s24-lo (logxor s24-lo s16-lo))
|
|
356 | + (setf (aref state 0) (make-double s24-hi s24-lo))))
|
|
357 | + (multiple-value-bind (s37-hi s37-lo)
|
|
358 | + (rotr s1-1 s1-0 (- 64 37))
|
|
359 | + ;; [s37-hi s37-lo] = rotl(s1, 37) = state[1]. But rotating
|
|
360 | + ;; left by 37 is the same as rotating right by 64-37.
|
|
361 | + (setf (aref state 1) (make-double s37-hi s37-lo)))
|
|
362 | + ;; Return result.
|
|
363 | + (values result-1
|
|
364 | + result-0))))
|
|
349 | 365 | |
350 | 366 | ;;; Size of the chunks returned by random-chunk.
|
351 | 367 | ;;;
|
... | ... | @@ -495,10 +511,10 @@ |
495 | 511 | :format-arguments (list arg)))))
|
496 | 512 | |
497 | 513 | ;; Jump function for the generator. See the jump function in
|
498 | -;; http://xoroshiro.di.unimi.it/xoroshiro128plus.c
|
|
514 | +;; https://prng.di.unimi.it/xoroshiro128starstar.c
|
|
499 | 515 | (defun random-state-jump (&optional (rng-state *random-state*))
|
500 | 516 | _N"Jump the RNG-STATE. This is equivalent to 2^64 calls to the
|
501 | - xoroshiro128+ generator. It can be used to generate 2^64
|
|
517 | + xoroshiro128** generator. It can be used to generate 2^64
|
|
502 | 518 | non-overlapping subsequences for parallel computations."
|
503 | 519 | (declare (type random-state rng-state))
|
504 | 520 | (let ((state (random-state-state rng-state))
|
... | ... | @@ -508,12 +524,12 @@ |
508 | 524 | (s1-1 0))
|
509 | 525 | (declare (type (unsigned-byte 32) s0-0 s0-1 s1-0 s1-1)
|
510 | 526 | (optimize (speed 3) (safety 0)))
|
511 | - ;; The constants are #xbeac0467eba5facb and #xd86b048b86aa9922,
|
|
527 | + ;; The constants are #xdf900294d8f554a5 and #x170865df4b3201fc,
|
|
512 | 528 | ;; and we process these numbers starting from the LSB. We want ot
|
513 | 529 | ;; process these in 32-bit chunks, so word-reverse the constants.
|
514 | - (dolist (jump '(#xeba5facb #xbeac0467 #x86aa9922 #xd86b048b))
|
|
515 | - (declare (type (unsigned-byte 32) jump))
|
|
516 | - (dotimes (b 32)
|
|
530 | + (dolist (jump '(#xdf900294d8f554a5 #x170865df4b3201fc))
|
|
531 | + (declare (type (unsigned-byte 64) jump))
|
|
532 | + (dotimes (b 64)
|
|
517 | 533 | (declare (fixnum b))
|
518 | 534 | (when (logbitp b jump)
|
519 | 535 | (multiple-value-bind (x1 x0)
|
... | ... | @@ -534,4 +550,4 @@ |
534 | 550 | x0)))
|
535 | 551 | (setf (aref state 0) (convert s0-1 s0-0))
|
536 | 552 | (setf (aref state 1) (convert s1-1 s1-0)))
|
537 | - rng-state)) |
|
553 | + rng-state)) |
... | ... | @@ -1695,58 +1695,101 @@ |
1695 | 1695 | (:temporary (:sc double-reg) s0)
|
1696 | 1696 | (:temporary (:sc double-reg) s1)
|
1697 | 1697 | (:temporary (:sc double-reg) t0)
|
1698 | + (:temporary (:sc double-reg) t1)
|
|
1698 | 1699 | (:generator 10
|
1700 | + ;; See https://prng.di.unimi.it/xoroshiro128starstar.c for the official code.
|
|
1701 | + ;;
|
|
1702 | + ;; This is what we're implementing, where s[] is our state vector.
|
|
1703 | + ;;
|
|
1704 | + ;; static uint64_t s[2];
|
|
1705 | + ;; static inline uint64_t rotl(const uint64_t x, int k) {
|
|
1706 | + ;; return (x << k) | (x >> (64 - k));
|
|
1707 | + ;; }
|
|
1708 | + ;;
|
|
1709 | + ;; uint64_t next(void) {
|
|
1710 | + ;; const uint64_t s0 = s[0];
|
|
1711 | + ;; uint64_t s1 = s[1];
|
|
1712 | + ;; const uint64_t result = rotl(s0 * 5, 7) * 9;
|
|
1713 | + ;;
|
|
1714 | + ;; s1 ^= s0;
|
|
1715 | + ;; s[0] = rotl(s0, 24) ^ s1 ^ (s1 << 16); // a, b
|
|
1716 | + ;; s[1] = rotl(s1, 37); // c
|
|
1717 | + ;;
|
|
1718 | + ;; return result;
|
|
1719 | + ;; }
|
|
1720 | + |
|
1699 | 1721 | ;; s0 = state[0]
|
1700 | 1722 | (inst movsd s0 (make-ea :dword :base state
|
1701 | - :disp (- (+ (* vm:vector-data-offset
|
|
1702 | - vm:word-bytes)
|
|
1703 | - (* 8 0))
|
|
1704 | - vm:other-pointer-type)))
|
|
1705 | - ;; s1 = state[1]
|
|
1706 | - (inst movsd s1 (make-ea :dword :base state
|
|
1707 | - :disp (- (+ (* vm:vector-data-offset
|
|
1708 | - vm:word-bytes)
|
|
1709 | - (* 8 1))
|
|
1710 | - vm:other-pointer-type)))
|
|
1711 | - ;; Compute result = s0 + s1
|
|
1712 | - (inst movapd t0 s0)
|
|
1713 | - (inst paddq t0 s1)
|
|
1714 | - ;; Save the 64-bit result as two 32-bit results
|
|
1723 | + :disp (- (+ (* vm:vector-data-offset
|
|
1724 | + vm:word-bytes)
|
|
1725 | + (* 8 0))
|
|
1726 | + vm:other-pointer-type)))
|
|
1727 | + ;; t0 = s0 * 5 = s0 << 2 + s0
|
|
1728 | + (inst movapd t0 s0) ; t0 = s0
|
|
1729 | + (inst psllq t0 2) ; t0 = t0 << 2 = 4*t0
|
|
1730 | + (inst paddq t0 s0) ; t0 = t0 + s0 = 5*t0
|
|
1731 | + |
|
1732 | + ;; t0 = rotl(t0, 7) = t0 << 7 | t0 >> (64-7)
|
|
1733 | + ;; = rotl(s0*5, 7)
|
|
1734 | + (inst movapd t1 t0) ; t1 = t0
|
|
1735 | + (inst psllq t1 7) ; t1 = t0 << 7
|
|
1736 | + (inst psrlq t0 (- 64 7)) ; t0 = t0 >> 57
|
|
1737 | + (inst orpd t0 t1) ; t0 = t0 << 7 | t0 >> 57 = rotl(t0, 7)
|
|
1738 | + |
|
1739 | + ;; t0 = t0 * 9 = t0 << 3 + t0
|
|
1740 | + ;; = rotl(s0*5, 7) * 9
|
|
1741 | + (inst movapd t1 t0) ; t1 = t0
|
|
1742 | + (inst psllq t1 3) ; t1 = t0 << 3
|
|
1743 | + (inst paddq t0 t1) ; t0 = t0 << 3 + t0 = 9*t0
|
|
1744 | + |
|
1745 | + ;; Save the result as two 32-bit results. r1 is the high 32 bits
|
|
1746 | + ;; and r0 is the low 32.
|
|
1715 | 1747 | (inst movd r0 t0)
|
1716 | 1748 | (inst psrlq t0 32)
|
1717 | 1749 | (inst movd r1 t0)
|
1718 | 1750 | |
1719 | - ;; s1 = s1 ^ s0
|
|
1720 | - (inst xorpd s1 s0)
|
|
1721 | - |
|
1722 | - ;; s0 = rotl(s0,55) = s0 << 55 | s0 >> 9
|
|
1723 | - (inst movapd t0 s0)
|
|
1724 | - (inst psllq s0 55) ; s0 = s0 << 55
|
|
1725 | - (inst psrlq t0 9) ; t0 = s0 >> 9
|
|
1726 | - (inst orpd s0 t0) ; s0 = rotl(s0, 55)
|
|
1727 | - |
|
1728 | - (inst movapd t0 s1)
|
|
1729 | - (inst xorpd s0 s1) ; s0 = s0 ^ s1
|
|
1730 | - (inst psllq t0 14) ; t0 = s1 << 14
|
|
1731 | - (inst xorpd s0 t0) ; s0 = s0 ^ t0
|
|
1751 | + ;; s1 = state[1]
|
|
1752 | + (inst movsd s1 (make-ea :dword :base state
|
|
1753 | + :disp (- (+ (* vm:vector-data-offset
|
|
1754 | + vm:word-bytes)
|
|
1755 | + (* 8 1))
|
|
1756 | + vm:other-pointer-type)))
|
|
1757 | + (inst xorpd s1 s0) ; s1 = s1 ^ s0
|
|
1758 | + |
|
1759 | + ;; s0 can now be reused as a temp.
|
|
1760 | + ;; s0 = rotl(s0, 24)
|
|
1761 | + (inst movapd t0 s0) ; t0 = s0
|
|
1762 | + (inst psllq t0 24) ; t0 = s0 << 24
|
|
1763 | + (inst psrlq s0 (- 64 24)) ; s0 = s0 >> 40
|
|
1764 | + (inst orpd s0 t0) ; s0 = s0 | t0 = rotl(s0, 24)
|
|
1765 | + |
|
1766 | + ;; s0 = s0 ^ s1 = rotl(s0, 24) ^ s1
|
|
1767 | + (inst xorpd s0 s1)
|
|
1768 | + |
|
1769 | + ;; s0 = s0 ^ (s1 << 16)
|
|
1770 | + (inst movapd t0 s1) ; t0 = s1
|
|
1771 | + (inst psllq t0 16) ; t0 = s1 << 16
|
|
1772 | + (inst xorpd s0 t0) ; s0 = rotl(s0, 24) ^ s1 ^ (s1 << 16)
|
|
1773 | + |
|
1774 | + ;; Save s0 to state[0]
|
|
1732 | 1775 | (inst movsd (make-ea :dword :base state
|
1733 | 1776 | :disp (- (+ (* vm:vector-data-offset
|
1734 | 1777 | vm:word-bytes)
|
1735 | 1778 | (* 8 0))
|
1736 | 1779 | vm:other-pointer-type))
|
1737 | - s0)
|
|
1780 | + s0)
|
|
1738 | 1781 | |
1739 | - ;; s1 = rotl(s1, 36) = s1 << 36 | s1 >> 28, using t0 as temp
|
|
1740 | - (inst movapd t0 s1)
|
|
1741 | - (inst psllq s1 36)
|
|
1742 | - (inst psrlq t0 28)
|
|
1743 | - (inst orpd s1 t0)
|
|
1782 | + ;; s1 = rotl(s1, 37)
|
|
1783 | + (inst movapd t0 s1) ; t0 = s1
|
|
1784 | + (inst psllq t0 37) ; t0 = s1 << 37
|
|
1785 | + (inst psrlq s1 (- 64 37)) ; s1 = s1 >> 27
|
|
1786 | + (inst orpd s1 t0) ; s1 = t0 | s1 = rotl(s1, 37)
|
|
1744 | 1787 | |
1788 | + ;; Save s1 to state[1]
|
|
1745 | 1789 | (inst movsd (make-ea :dword :base state
|
1746 | 1790 | :disp (- (+ (* vm:vector-data-offset
|
1747 | 1791 | vm:word-bytes)
|
1748 | 1792 | (* 8 1))
|
1749 | 1793 | vm:other-pointer-type))
|
1750 | - s1)))
|
|
1751 | -)
|
|
1752 | - |
|
1794 | + s1)))
|
|
1795 | +) |
... | ... | @@ -24,6 +24,9 @@ public domain. |
24 | 24 | `ext:stream-file-length` generic function.
|
25 | 25 | * Changes:
|
26 | 26 | * Update to ASDF 3.3.7
|
27 | + * The RNG has changed from an old version of xoroshiro128+ to
|
|
28 | + xoroshiro128**. This means sequences of random numbers will be
|
|
29 | + different from before. See ~~#276~~.
|
|
27 | 30 | * ANSI compliance fixes:
|
28 | 31 | * Bug fixes:
|
29 | 32 | * Gitlab tickets:
|
... | ... | @@ -44,8 +47,7 @@ public domain. |
44 | 47 | * ~~#266~~ Support "~user" in namestrings
|
45 | 48 | * ~~#271~~ Update ASDF to 3.3.7
|
46 | 49 | * ~~#272~~ Move scavenge code for static vectors to its own function
|
47 | - * ~~#277~~ `float-ratio-float` returns least postive float for
|
|
48 | - ratios closer to that than zero.
|
|
50 | + * ~~#276~~ Implement xoroshiro128** generator for x86
|
|
49 | 51 | * Other changes:
|
50 | 52 | * Improvements to the PCL implementation of CLOS:
|
51 | 53 | * Changes to building procedure:
|
... | ... | @@ -12220,6 +12220,14 @@ msgstr "" |
12220 | 12220 | msgid "Argument is not a RANDOM-STATE, T, or NIL: ~S"
|
12221 | 12221 | msgstr ""
|
12222 | 12222 | |
12223 | +#: src/code/rand-xoroshiro.lisp
|
|
12224 | +msgid ""
|
|
12225 | +"Generate the next 64-bit result from the xoroshiro128** generator\n"
|
|
12226 | +" using the state in STATE, a simple-array of 2 double-floats. The\n"
|
|
12227 | +" 64-bit result is returned as 2 32-bit values, with the high 32-bits\n"
|
|
12228 | +" being the first value."
|
|
12229 | +msgstr ""
|
|
12230 | + |
|
12223 | 12231 | #: src/code/rand-xoroshiro.lisp
|
12224 | 12232 | msgid ""
|
12225 | 12233 | "Generate a uniformly distributed pseudo-random number between zero\n"
|
... | ... | @@ -12233,7 +12241,7 @@ msgstr "" |
12233 | 12241 | #: src/code/rand-xoroshiro.lisp
|
12234 | 12242 | msgid ""
|
12235 | 12243 | "Jump the RNG-STATE. This is equivalent to 2^64 calls to the\n"
|
12236 | -" xoroshiro128+ generator. It can be used to generate 2^64\n"
|
|
12244 | +" xoroshiro128** generator. It can be used to generate 2^64\n"
|
|
12237 | 12245 | " non-overlapping subsequences for parallel computations."
|
12238 | 12246 | msgstr ""
|
12239 | 12247 |
... | ... | @@ -48,16 +48,18 @@ |
48 | 48 | (assert-equal 0 (kernel::random-state-rand *test-state*))
|
49 | 49 | (assert-equal nil (kernel::random-state-cached-p *test-state*))
|
50 | 50 | |
51 | - (dolist (item '((#x18d5f05c086e0086 (#x228f4926843b364d #x74dfe78e715c81be))
|
|
52 | - (#x976f30b4f597b80b (#x5b6bd4558bd96a68 #x567b7f35650aea8f))
|
|
53 | - (#xb1e7538af0e454f7 (#x13e5253e242fac52 #xed380e70d10ab60e))
|
|
54 | - (#x011d33aef53a6260 (#x9d0764952ca00d8a #x5251a5cfedd2b4ef))
|
|
55 | - (#xef590a651a72c279 (#xba4ef2b425bda963 #x172b965cf56c15ac))
|
|
56 | - (#xd17a89111b29bf0f (#x458277a5e5f0a21b #xd1bccfad6564e8d))
|
|
57 | - (#x529e44a0bc46f0a8 (#x2becb68d5a7194c7 #x3a6ec964899bb5f3))
|
|
58 | - (#x665b7ff1e40d4aba (#xededfd481d0a19fe #x3ea213411827fe9d))
|
|
59 | - (#x2c9010893532189b (#xd7bb59bcd8fba26f #x52de763d34fee090))
|
|
60 | - (#x2a99cffa0dfa82ff (#xf96e892c62d6ff2e #xc0542ff85652f81e))))
|
|
51 | + (dolist (item
|
|
52 | + ;; Results for xoroshiro128**
|
|
53 | + '((#x41db14eb317141fe (#x16dfbf3d760d0fa4 #xe9bfcf1ce2b9037c))
|
|
54 | + (#xaa4ee6e025dfec01 (#xb237e99a3c7ad367 #x96819b1fec0e0432))
|
|
55 | + (#xea080e50cb948fa5 (#xcc0fd8226093e0bc #x0e9aeaa496ce50ba))
|
|
56 | + (#x647f057cff408a6e (#xd273573bfa97bfde #xcbb600d852a650de))
|
|
57 | + (#x232ac586565d037e (#x75dc686d99e39c57 #x063de00338aafc75))
|
|
58 | + (#xdf2da206813da6d6 (#x9616cabb961ebc4a #x292c044e7c310dd4))
|
|
59 | + (#x00d17cb1b38c852f (#xca593a661127a754 #x45f633d7e759debd))
|
|
60 | + (#xd7a1f881fc34e641 (#xe00fd868db5d20d3 #xcfcf3d31f5e1363e))
|
|
61 | + (#x64853747af628d30 (#xa24296c5ebb11935 #xd782dda5f81cab25))
|
|
62 | + (#xda40653710b7293d (#xfb4be9d4941ff086 #x75b6420eb8096c02))))
|
|
61 | 63 | (destructuring-bind (value state)
|
62 | 64 | item
|
63 | 65 | (assert-equal value (64-bit-value *test-state*))
|
... | ... | @@ -69,10 +71,13 @@ |
69 | 71 | (kernel::make-random-object :state (kernel::init-random-state #x12345678)
|
70 | 72 | :rand 0
|
71 | 73 | :cached-p nil))
|
72 | - (dolist (result '((#x291ddf8e6f6a7b67 #x1f9018a12f9e031f)
|
|
73 | - (#x88a7aa12158558d0 #xe264d785ab1472d9)
|
|
74 | - (#x207e16f73c51e7ba #x999c8a0a9a8d87c0)
|
|
75 | - (#x28f8959d3bcf5ff1 #x38091e563ab6eb98)))
|
|
74 | + (dolist (result
|
|
75 | + ;; Results for xoroshiro128** jump function
|
|
76 | + '((#x19a22191480b0a4e #x43b3d7ee592dd4cf)
|
|
77 | + (#x76cb87035d0b6e99 #xb6827bcf2ef8267c)
|
|
78 | + (#x5125201dbdf76860 #x8984c075043869e2)
|
|
79 | + (#x2c06f0667255309f #xa48cbe2e60fc1d65)
|
|
80 | + ))
|
|
76 | 81 | (kernel:random-state-jump *test-state*)
|
77 | 82 | (assert-equal result (multiple-value-list
|
78 | 83 | (64-bit-rng-state *test-state*)))))
|
1 | +/* Written in 2018 by David Blackman and Sebastiano Vigna (vigna@acm.org)
|
|
2 | + |
|
3 | +To the extent possible under law, the author has dedicated all copyright
|
|
4 | +and related and neighboring rights to this software to the public domain
|
|
5 | +worldwide. This software is distributed without any warranty.
|
|
6 | + |
|
7 | +See <http://creativecommons.org/publicdomain/zero/1.0/>. */
|
|
8 | + |
|
9 | +#include <stdint.h>
|
|
10 | + |
|
11 | +/* This is xoroshiro128** 1.0, one of our all-purpose, rock-solid,
|
|
12 | + small-state generators. It is extremely (sub-ns) fast and it passes all
|
|
13 | + tests we are aware of, but its state space is large enough only for
|
|
14 | + mild parallelism.
|
|
15 | + |
|
16 | + For generating just floating-point numbers, xoroshiro128+ is even
|
|
17 | + faster (but it has a very mild bias, see notes in the comments).
|
|
18 | + |
|
19 | + The state must be seeded so that it is not everywhere zero. If you have
|
|
20 | + a 64-bit seed, we suggest to seed a splitmix64 generator and use its
|
|
21 | + output to fill s. */
|
|
22 | + |
|
23 | + |
|
24 | +static inline uint64_t rotl(const uint64_t x, int k) {
|
|
25 | + return (x << k) | (x >> (64 - k));
|
|
26 | +}
|
|
27 | + |
|
28 | + |
|
29 | +static uint64_t s[2];
|
|
30 | + |
|
31 | +uint64_t next(void) {
|
|
32 | + const uint64_t s0 = s[0];
|
|
33 | + uint64_t s1 = s[1];
|
|
34 | + const uint64_t result = rotl(s0 * 5, 7) * 9;
|
|
35 | + |
|
36 | + s1 ^= s0;
|
|
37 | + s[0] = rotl(s0, 24) ^ s1 ^ (s1 << 16); // a, b
|
|
38 | + s[1] = rotl(s1, 37); // c
|
|
39 | + |
|
40 | + return result;
|
|
41 | +}
|
|
42 | + |
|
43 | + |
|
44 | +/* This is the jump function for the generator. It is equivalent
|
|
45 | + to 2^64 calls to next(); it can be used to generate 2^64
|
|
46 | + non-overlapping subsequences for parallel computations. */
|
|
47 | + |
|
48 | +void jump(void) {
|
|
49 | + static const uint64_t JUMP[] = { 0xdf900294d8f554a5, 0x170865df4b3201fc };
|
|
50 | + |
|
51 | + uint64_t s0 = 0;
|
|
52 | + uint64_t s1 = 0;
|
|
53 | + for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++)
|
|
54 | + for(int b = 0; b < 64; b++) {
|
|
55 | + if (JUMP[i] & UINT64_C(1) << b) {
|
|
56 | + s0 ^= s[0];
|
|
57 | + s1 ^= s[1];
|
|
58 | + }
|
|
59 | + next();
|
|
60 | + }
|
|
61 | + |
|
62 | + s[0] = s0;
|
|
63 | + s[1] = s1;
|
|
64 | +}
|
|
65 | + |
|
66 | + |
|
67 | +/* This is the long-jump function for the generator. It is equivalent to
|
|
68 | + 2^96 calls to next(); it can be used to generate 2^32 starting points,
|
|
69 | + from each of which jump() will generate 2^32 non-overlapping
|
|
70 | + subsequences for parallel distributed computations. */
|
|
71 | + |
|
72 | +void long_jump(void) {
|
|
73 | + static const uint64_t LONG_JUMP[] = { 0xd2a98b26625eee7b, 0xdddf9b1090aa7ac1 };
|
|
74 | + |
|
75 | + uint64_t s0 = 0;
|
|
76 | + uint64_t s1 = 0;
|
|
77 | + for(int i = 0; i < sizeof LONG_JUMP / sizeof *LONG_JUMP; i++)
|
|
78 | + for(int b = 0; b < 64; b++) {
|
|
79 | + if (LONG_JUMP[i] & UINT64_C(1) << b) {
|
|
80 | + s0 ^= s[0];
|
|
81 | + s1 ^= s[1];
|
|
82 | + }
|
|
83 | + next();
|
|
84 | + }
|
|
85 | + |
|
86 | + s[0] = s0;
|
|
87 | + s[1] = s1;
|
|
88 | +}
|
|
89 | + |
|
90 | +/*********************************************************************/
|
|
91 | + |
|
92 | +#include <stdio.h>
|
|
93 | + |
|
94 | +/*
|
|
95 | + * Print out the first 10 random numbers from the generator. This is
|
|
96 | + * used for the expected results in the rng unit test.
|
|
97 | + */
|
|
98 | +void
|
|
99 | +test_rng()
|
|
100 | +{
|
|
101 | + int k;
|
|
102 | + uint64_t r;
|
|
103 | +
|
|
104 | +
|
|
105 | + s[0] = 0x38f1dc39d1906b6full;
|
|
106 | + s[1] = 0xdfe4142236dd9517ull;
|
|
107 | + |
|
108 | + printf(";; First 10 outputs\n");
|
|
109 | + |
|
110 | + for (k = 0; k < 10; ++k) {
|
|
111 | + r = next();
|
|
112 | + printf("%2d: #x%016llx (#x%016llx #x%016llx)\n", k, r, s[0], s[1]);
|
|
113 | + }
|
|
114 | +}
|
|
115 | + |
|
116 | +/*
|
|
117 | + * Print out the first 4 states from jumping generator by 2^64 steps.
|
|
118 | + * This is used for the expected results in the rng unit test.
|
|
119 | + */
|
|
120 | +void
|
|
121 | +test_jump()
|
|
122 | +{
|
|
123 | + int k;
|
|
124 | + |
|
125 | + s[0] = 0x38f1dc39d1906b6full;
|
|
126 | + s[1] = 0xdfe4142236dd9517ull;
|
|
127 | + |
|
128 | + printf(";; First 4 jumps\n");
|
|
129 | + for (k = 0; k < 4; ++k) {
|
|
130 | + jump();
|
|
131 | + printf("%2d: #x%016llx #x%016llx\n", k, s[0], s[1]);
|
|
132 | + }
|
|
133 | +}
|
|
134 | + |
|
135 | + |
|
136 | +int
|
|
137 | +main()
|
|
138 | +{
|
|
139 | + test_rng();
|
|
140 | + test_jump();
|
|
141 | +
|
|
142 | + return 0;
|
|
143 | +} |