1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152
|
;;; Assert that all odd divisors less than 32769 have an optimized
;;; remainder coefficient using 32-bit operations.
;;; Symbol hashsets with fewer than that many cells can use fast remainder.
;;; (I could/should implement the 64-bit algorithm I suppose)
(with-test (:name :optimized-remainder-exists)
(loop for divisor from 3 below 32769 by 2
do
(multiple-value-bind (mask c) (sb-impl::optimized-symtbl-remainder-params divisor)
(assert (and (plusp c) (plusp mask))))))
#+interpreter (invoke-restart 'run-tests::skip-file)
(defvar *rs* (make-random-state t)) ; get a random random-state
#+x86-64
(with-test (:name :udiv-magic)
(dotimes (i 1000)
(let ((divisor (+ 5 (random #xfffffff *rs*))))
(multiple-value-bind (m a s) (sb-c:compute-udiv32-magic divisor)
(dotimes (i 10000)
(let* ((dividend (random (ash 1 32) *rs*))
(quotient (sb-vm::udiv32-via-multiply dividend m a s))
(expected (truncate dividend divisor)))
(assert (= quotient expected))))))))
#+x86-64
(with-test (:name :urem-magic)
(dotimes (i 1000)
(let ((divisor (+ 5 (random #xfffffff *rs*))))
(multiple-value-bind (m a s) (sb-c:compute-udiv32-magic divisor)
(dotimes (i 10000)
(let* ((dividend (random (ash 1 32) *rs*))
(remainder (sb-vm::urem32-via-multiply dividend divisor m a s))
(expected (rem dividend divisor)))
(assert (= remainder expected))))))))
(with-test (:name :urem32-ultrafast)
(dotimes (i 1000)
;; The 32-bit algorithm can only handle random 16-bit inputs (or some inputs
;; with more than 16 bits, but it's random which inputs are OK)
;; A 64-bit variant would handle all 32-bit inputs, but I don't need it.
(let* ((divisor (+ 5 (random #xffff *rs*)))
(c (sb-c:compute-fastrem-coefficient divisor 16 32)))
(dotimes (i 20000)
(let* ((dividend (random (ash 1 16) *rs*))
(remainder (sb-vm::fastrem-32 dividend c divisor))
(expected (rem dividend divisor)))
(assert (= remainder expected)))))))
#+64-bit
(with-test (:name :urem64-ultrafast)
(dotimes (i 1000)
(let* ((divisor (+ 5 (random #xfffffff *rs*)))
(c (sb-c:compute-fastrem-coefficient divisor 32 64)))
(dotimes (i 20000)
(let* ((dividend (random (ash 1 32) *rs*))
(remainder (sb-vm::fastrem-64 dividend c divisor))
(expected (rem dividend divisor)))
(assert (= remainder expected)))))))
;;;; The assembly code for the benchmarks contains no full calls.
;;;; Therefore we're measuring the speed of the vops as accurately as possible.
;;; These loops show the advantage of division by using multiplication
;;; even when the CPU directly implements division.
;;; BASIC-WAY takes about 4 to 6 times as many CPU cycles as TRICKY-WAY.
;;; But these loops are too time-consuming for a regression test.
(defun div-basic-way (&aux (result 0))
(declare (fixnum result))
(loop for divisor from 3 to 1000
do (dotimes (dividend #xfffff)
(let ((ans (truncate dividend divisor)))
(setq result (logxor result ans)))))
result)
#+x86-64
(defun div-tricky-way (&aux (result 0))
(declare (fixnum result))
(loop for divisor from 3 to 1000
do (multiple-value-bind (m a s) (sb-c:compute-udiv32-magic divisor)
(dotimes (dividend #xfffff)
(let ((ans (sb-vm::udiv32-via-multiply dividend m a s)))
(setq result (logxor result ans))))))
result)
;;; * (time (rem-basic-way))
;;; Evaluation took:
;;; 30.123 seconds of real time
;;; 30.112920 seconds of total run time (30.112919 user, 0.000001 system)
;;; 99.97% CPU
;;; 84,142,880,093 processor cycles
(defun rem-basic-way (&aux (result 0)) ; about 32 CPU cycles per operation
(declare (fixnum result))
(loop for divisor from 3 to 10000
do (dotimes (dividend #x3ffff)
(let ((ans (rem dividend divisor)))
(setq result (logxor result ans)))))
result)
;;; * (time (rem-tricky-way))
;;; Evaluation took:
;;; 9.999 seconds of real time
;;; 9.998409 seconds of total run time (9.994458 user, 0.003951 system)
;;; 99.99% CPU
;;; 27,938,111,662 processor cycles
#+x86-64
(defun rem-tricky-way (&aux (result 0)) ; about 11 CPU cycles per operation
(declare (fixnum result))
(loop for divisor from 3 to 10000
do (multiple-value-bind (m a s) (sb-c:compute-udiv32-magic divisor)
(dotimes (dividend #x3ffff)
(let ((ans (sb-vm::urem32-via-multiply dividend divisor m a s)))
(setq result (logxor result ans))))))
result)
;;; * (time (ultrafastrem-way))
;;; Evaluation took:
;;; 6.175 seconds of real time
;;; 6.171059 seconds of total run time (6.171059 user, 0.000000 system)
;;; 99.94% CPU
;;; 17,243,611,816 processor cycles
(defun ultrafastrem-way (&aux (result 0)) ; about 7 CPU cycles per operation
(declare (fixnum result))
(loop for divisor from 3 to 10000
do (let ((c (sb-c:compute-fastrem-coefficient divisor 18 32)))
(dotimes (dividend #x3ffff)
(let ((ans (sb-vm::fastrem-32 dividend c divisor)))
(setq result (logxor result ans))))))
result)
;;; This is the generalized code for the ultrafast way, which isn't actually
;;; ultrafast, but it shows the correctness of the algorithm at various precisions.
(defun try-fastrem (divisor nbits) ;; nbits = number of bits in numerator
(macrolet ((try (fraction-bits)
`(multiple-value-bind (fraction-bits c)
(sb-c:compute-fastrem-coefficient divisor nbits ,fraction-bits)
;;(format t "~&F=~D~%" fraction-bits)
(dotimes (numerator (ash 1 nbits)) ; exhaustively test all possible numerators
(let* ((frac (ldb (byte fraction-bits 0) (* c numerator)))
(answer (ash (* frac divisor) (- fraction-bits)))
(expect (mod numerator divisor)))
(unless (= answer expect)
(format t "~12,'0b ~12,'0b ~3d ~3d~%"
numerator frac answer (mod numerator divisor))))))))
(try :variable)
;(try 16)
(try 32)))
(defun try-fastrem-bigly ()
(loop for divisor from 3 to 10000
do (format t "divisor=~d~%" divisor)
(try-fastrem divisor 18)))
|