Shootout/Spectral
From HaskellWiki
A Shootout Entry for the spectral norm test
Contents |
1 Timings
Debian Linux x86, n=2,500
||Entry || Time|| ||dons || 9.407|| ||old || 10.564||
2 Current entry
Damn fast.
{-# OPTIONS -fvia-C -fexcess-precision #-} -- -- The Computer Language Benchmarks Game -- http://shootout.alioth.debian.org/ -- -- Modified by Ryan Trinkle: 1) change from divInt# to uncheckedIShiftRA# -- 2) changed -optc-O to -optc-O3 -- 3) added -optc-ffast-math -- Translation from Clean by Don Stewart -- -- Should be compiled with: -- -O -fglasgow-exts -fbang-patterns -- -optc-O3 -optc-march=pentium4 -optc-mfpmath=sse -optc-msse2 -optc-ffast-math -- -- Note: The following flags appear to INCREASE running time: -- -O2 -optc-funroll-loops import System import Foreign.Marshal.Array import Foreign import Text.Printf import Control.Monad import Data.ByteString.Internal import GHC.Base import GHC.Float import GHC.Int type Reals = Ptr Double main = do n <- getArgs >>= readIO . head u <- mallocArray n :: IO Reals forM_ [0..n-1] $ \i -> pokeElemOff u i 1 v <- mallocArray n :: IO Reals forM_ [0..n-1] $ \i -> pokeElemOff v i 0 powerMethod 10 n u v printf "%.9f\n" (eigenvalue n u v 0 0 0) ------------------------------------------------------------------------ eigenvalue :: Int -> Reals -> Reals -> Int -> Double -> Double -> Double eigenvalue !n !u !v !i !vBv !vv | i < n = eigenvalue n u v (i+1) (vBv + ui * vi) (vv + vi * vi) | otherwise = sqrt $! vBv / vv where ui = inlinePerformIO (peekElemOff u i) vi = inlinePerformIO (peekElemOff v i) ------------------------------------------------------------------------ powerMethod :: Int -> Int -> Reals -> Reals -> IO () powerMethod !i !n !u !v = allocaArray n $ \t -> replicateM_ i $ timesAtAv t n u v >> timesAtAv t n v u -- multiply vector v by matrix A and then by matrix A transposed timesAtAv :: Reals -> Int -> Reals -> Reals -> IO () timesAtAv !t !n !u !atau = do timesAv n u t timesAtv n t atau timesAv :: Int -> Reals -> Reals -> IO () timesAv !n !u !au = go 0 where go :: Int -> IO () go !i = when (i < n) $ do pokeElemOff au i (avsum i 0 0) go (i+1) avsum :: Int -> Int -> Double -> Double avsum !i !j !acc | j < n = avsum i (j+1) (acc + ((aij i j) * uj)) | otherwise = acc where uj = inlinePerformIO (peekElemOff u j) timesAtv :: Int -> Reals -> Reals -> IO () timesAtv !n !u !a = go 0 where go :: Int -> IO () go !i = when (i < n) $ do pokeElemOff a i (atvsum i 0 0) go (i+1) atvsum :: Int -> Int -> Double -> Double atvsum !i !j !acc | j < n = atvsum i (j+1) (acc + ((aij j i) * uj)) | otherwise = acc where uj = inlinePerformIO (peekElemOff u j) -- -- manually unbox the inner loop: -- aij i j = 1 / fromIntegral ((i+j) * (i+j+1) `div` 2 + i + 1) -- aij (I# i) (I# j) = D# ( case i +# j of n -> case n *# (n+#1#) of t -> case t `uncheckedIShiftRA#` 1# of u -> case u +# (i +# 1#) of r -> 1.0## /## (int2Double# r))
3 Proposed entry
Uses Ptr Double, and carefully unboxes the aij inner computation (where 70% of the time is spent). Runs 2.3x slower than C on my p4.
{-# OPTIONS -fexcess-precision #-} -- -- The Computer Language Shootout -- http://shootout.alioth.debian.org/ -- -- Translation from Clean by Don Stewart -- -- Should be compiled with: -- -O -fglasgow-exts -fbang-patterns -- -optc-O -optc-march=pentium4 -optc-mfpmath=sse -optc-msse2 -- import System import Foreign.Marshal.Array import Foreign import Text.Printf import Control.Monad import Data.ByteString.Base import GHC.Base import GHC.Float import GHC.Int type Reals = Ptr Double main = do n <- getArgs >>= readIO . head u <- mallocArray n :: IO Reals forM_ [0..n-1] $ \i -> pokeElemOff u i 1 v <- mallocArray n :: IO Reals forM_ [0..n-1] $ \i -> pokeElemOff v i 0 powerMethod 10 n u v printf "%.9f\n" (eigenvalue n u v 0 0 0) ------------------------------------------------------------------------ eigenvalue :: Int -> Reals -> Reals -> Int -> Double -> Double -> Double eigenvalue !n !u !v !i !vBv !vv | i < n = eigenvalue n u v (i+1) (vBv + ui * vi) (vv + vi * vi) | otherwise = sqrt $! vBv / vv where ui = inlinePerformIO (peekElemOff u i) vi = inlinePerformIO (peekElemOff v i) ------------------------------------------------------------------------ powerMethod :: Int -> Int -> Reals -> Reals -> IO () powerMethod !i !n !u !v = allocaArray n $ \t -> replicateM_ i $ timesAtAv t n u v >> timesAtAv t n v u -- multiply vector v by matrix A and then by matrix A transposed timesAtAv :: Reals -> Int -> Reals -> Reals -> IO () timesAtAv !t !n !u !atau = do timesAv n u t timesAtv n t atau {-# INLINE timesAtAv #-} timesAv :: Int -> Reals -> Reals -> IO () timesAv !n !u !au = go 0 where go :: Int -> IO () go !i = when (i < n) $ do pokeElemOff au i (avsum i 0 0) go (i+1) avsum :: Int -> Int -> Double -> Double avsum !i !j !acc | j < n = avsum i (j+1) (acc + ((aij i j) * uj)) | otherwise = acc where uj = inlinePerformIO (peekElemOff u j) {-# INLINE timesAv #-} timesAtv :: Int -> Reals -> Reals -> IO () timesAtv !n !u !a = go 0 where go :: Int -> IO () go !i = when (i < n) $ do pokeElemOff a i (atvsum i 0 0) go (i+1) atvsum :: Int -> Int -> Double -> Double atvsum !i !j !acc | j < n = atvsum i (j+1) (acc + ((aij j i) * uj)) | otherwise = acc where uj = inlinePerformIO (peekElemOff u j) {-# INLINE timesAtv #-} -- -- manually unbox the inner loop: -- aij i j = 1 / fromIntegral ((i+j) * (i+j+1) `div` 2 + i + 1) -- aij (I# i) (I# j) = D# ( case i +# j of n -> case n *# (n+#1#) of t -> case divInt# t 2# of u -> case u +# (i +# 1#) of r -> 1.0## /## (int2Double# r))
4 Current entry
Better gcc flags. Bang patterns. Submitted
Didn't do so well in practice though :/
{-# OPTIONS -O -fglasgow-exts -fbang-patterns -funbox-strict-fields -fexcess-precision -optc-O2 -optc-march=pentium4 -optc-mfpmath=sse -optc-msse2 #-} -- -- The Great Computer Language Shootout -- http:--shootout.alioth.debian.org/ -- -- Contributed by Don Stewart -- import Monad import System import Text.Printf import Data.Array.IO import Data.Array.Base main = getArgs >>= approximate . read . head >>= printf "%.9f\n" approximate n = do u <- newArray (0,n-1) 1 :: IO (IOUArray Int Double) v <- newArray (0,n-1) 0 :: IO (IOUArray Int Double) sequence_ $ replicate 10 $ multiplyAtAv n u v >> multiplyAtAv n v u let loop !vbv !vv !i | i >= n = return (vbv,vv) | otherwise = do ui <- unsafeRead u i vi <- unsafeRead v i loop (vbv + ui * vi) (vv + vi * vi) (i+1) (vbv,vv) <- loop 0 0 0 return $! sqrt (vbv/vv) -- return element i,j of infinite matrix A a i j = 1 / fromIntegral (x*(x+1) `div` 2 + i + 1) where x = i+j -- multiply vector v by matrix A */ multiplyAv !n !v !av = loop 0 where loop i = when (i < n) $ do avi <- loop' i 0 0 unsafeWrite av i avi >> loop (i+1) loop' !i !j !av | j >= n = return av | otherwise = do vj <- unsafeRead v j loop' i (j+1) (av + a i j * vj) -- multiply vector v by matrix A transposed multiplyAtv !n !v !atv = loop 0 where loop i = when (i < n) $ do atvi <- loop' i 0 0 unsafeWrite atv i atvi >> loop (i+1) loop' !i !j !atvi | j >= n = return atvi | otherwise = do vj <- unsafeRead v j loop' i (j+1) (atvi + a j i * vj) -- multiply vector v by matrix A and then by matrix A transposed */ multiplyAtAv !n !v !atav = do u <- newArray (0,n-1) 0 :: IO (IOUArray Int Double) multiplyAv n v u >> multiplyAtv n u atav
5 Old entry
GHC unboxed this better. Careful attention payed to the unboxing. -O2 -optc-O -optc-ffast-math -fexcess-precision
{-# OPTIONS -optc-O #-} -- -- The Great Computer Language Shootout -- http:--shootout.alioth.debian.org/ -- -- Contributed by Don Stewart -- -- gcc miscompiles this program with -O3 -- import Monad import System import Numeric import Data.Array.IO import Data.Array.Base main = getArgs >>= approximate . read . head >>= putStrLn . \s -> showFFloat (Just 9) s [] approximate n = do u <- newArray (0,n-1) 1 :: IO (IOUArray Int Double) v <- newArray_ (0,n-1) :: IO (IOUArray Int Double) sequence_ $ replicate 10 $ multiplyAtAv n u v >> multiplyAtAv n v u loop 0 0 0 n u v loop vbv vv i n u v | vbv `seq` vv `seq` i `seq` n `seq` u `seq` v `seq` False = undefined loop vbv vv i n u v | i >= n = return $! sqrt (vbv/vv) | otherwise = do ui <- unsafeRead u i vi <- unsafeRead v i loop (vbv + ui * vi) (vv + vi * vi) (i+1) n u v -- return element i,j of infinite matrix A a i j | i `seq` j `seq` False = undefined a i j = 1 / fromIntegral (x*(x+1) `div` 2 + i + 1) where x = i+j -- multiply vector v by matrix A */ multiplyAv n v av | n `seq` v `seq` av `seq` False = undefined multiplyAv n v av = loop 0 where loop i = when (i < n) $ loop' i 0 0 >>= unsafeWrite av i >> loop (i+1) loop' i j av | i `seq` j `seq` av `seq` False = undefined loop' i j av | j >= n = return av | otherwise = do vj <- v `unsafeRead` j loop' i (j+1) (av + a i j * vj) -- multiply vector v by matrix A transposed multiplyAtv n v atv | n `seq` v `seq` atv `seq` False = undefined multiplyAtv n v atv = loop 0 where loop i = when (i < n) $ loop' i 0 0 >>= unsafeWrite atv i >> loop (i+1) loop' i j atvi | j `seq` atvi `seq` False = undefined loop' i j atvi | j >= n = return atvi | otherwise = do vj <- v `unsafeRead` j loop' i (j+1) (atvi + a j i * vj) -- multiply vector v by matrix A and then by matrix A transposed */ multiplyAtAv n v atav | n `seq` v `seq` atav `seq` False = undefined multiplyAtAv n v atav = do u <- newArray_ (0,n-1) :: IO (IOUArray Int Double) multiplyAv n v u >> multiplyAtv n u atav
6 Current entry
-- The Great Computer Language Shootout -- http:--shootout.alioth.debian.org/ -- -- Original C contributed by Sebastien Loisel -- Conversion to C++ by Jon Harrop -- Conversion to Haskell by Einar Karttunen import Control.Monad.ST import Data.Array.Base import Data.Array.ST import Numeric import System eval_A :: Int -> Int -> Double eval_A i j = 1 / fromIntegral ((i+j)*(i+j+1) `div` 2 + i + 1) plusAt :: STUArray s Int Double -> Int -> Double -> ST s () plusAt a i v = do o <- unsafeRead a i unsafeWrite a i (v+o) eval_A_Times_u :: STUArray s Int Double -> STUArray s Int Double -> ST s () eval_A_Times_u u au = outer (snd $ bounds u) where outer 0 = unsafeWrite au 0 0 >> inner 0 (snd $ bounds u) outer i = unsafeWrite au i 0 >> inner i (snd $ bounds u) >> outer (i-1) inner i 0 = unsafeRead u 0 >>= \uj -> plusAt au i (eval_A i 0 * uj) inner i j = unsafeRead u j >>= \uj -> plusAt au i (eval_A i j * uj) >> inner i (j-1) eval_At_Times_u :: STUArray s Int Double -> STUArray s Int Double -> ST s () eval_At_Times_u u au = outer (snd $ bounds u) where outer 0 = unsafeWrite au 0 0 >> inner 0 (snd $ bounds u) outer i = unsafeWrite au i 0 >> inner i (snd $ bounds u) >> outer (i-1) inner i 0 = unsafeRead u 0 >>= \uj -> plusAt au i (eval_A 0 i * uj) inner i j = unsafeRead u j >>= \uj -> plusAt au i (eval_A j i * uj) >> inner i (j-1) eval_AtA_Times_u u v = do w <- newArray (bounds u) 0 eval_A_Times_u u w >> eval_At_Times_u w v main = do n <- getArgs >>= return.read.head let (vBv,vv) = runST (do u <- newArray (0,n-1) 1 v <- newArray (0,n-1) 0 sequence_ $ replicate 10 (eval_AtA_Times_u u v >> eval_AtA_Times_u v u) vLoop u v n (0, 0)) putStrLn $ showFFloat (Just 9) (sqrt (vBv/vv)) "" vLoop :: STUArray s Int Double -> STUArray s Int Double -> Int -> (Double,Double) -> ST s (Double,Double) vLoop u v 0 a = return a vLoop u v (i+1) (vBv,vv) = vLoop u v i =<< op where op = do ui <- unsafeRead u i vi <- unsafeRead v i return (vBv+(ui*vi),vv+(vi*vi))
7 STUArray-based solution
One-line-change convertible to IOUArray, which seems to be exactly as fast as STUArray. The important part seems to be the unboxed version of the 'a' function and the iterateM_ hack (which I don't quite know how it works at all).
Otherwise, it seems to be just about exactly as fast as the unsafe Ptr Double versions.
{-# OPTIONS_GHC -fglasgow-exts #-} -- -- The Computer Language Benchmarks Game -- http://shootout.alioth.debian.org/ -- -- Contributed by Simon Brenner -- -- Compile with: -- -- C-options: -- -fvia-c -optc-O3 -optc-march=native -optc-mfpmath=sse -optc-msse2 -- -optc-ffast-math -- GHC options: -- -O2 -fexcess-precision import Control.Monad.ST import Control.Monad import Data.Array.IO import Data.Array.ST import Data.Ix import Data.Bits import System.Environment -- Various magic used for the unboxed version of the 'a' function import GHC.Base import GHC.Float main = do n <- fmap (read . head) getArgs print (spectralNorm n) -- Use this variant for IOUArray rather than STUArray --print =<< spectralNormM allocDoubleIO n spectralNorm :: Int -> Double spectralNorm n = runST (spectralNormM allocDouble n) ----------------- -- Benchmark code spectralNormM allocator n = case allocator of (Allocator alloc) -> do let ubound = n-1 u <- alloc (0,ubound) v <- alloc (0,ubound) tmp <- alloc (0,ubound) fill u 1.0 replicateM_ 10 $ do multiplyAtAv ubound u tmp v multiplyAtAv ubound v tmp u vBv <- u .| v vv <- v .| v return $ sqrt (vBv / vv) multiplyAtAv n v u atav = do multiplyAv n v u multiplyAtv n u atav multiplyAv n v av = iterateM_ 0 n $ \i -> do writeArray av i 0.0 iterateM_ 0 n $ \j -> do v' <- readArray v j modArray av i (+ ((a i j) * v')) multiplyAtv n v atv = iterateM_ 0 n $ \i -> do writeArray atv i 0.0 iterateM_ 0 n $ \j -> do v' <- readArray v j modArray atv i (+ ((a j i) * v')) a :: Int -> Int -> Double --a i j = 1.0 / fromIntegral (((i + j) * (i + j + 1)) `shiftR` 1 + i + 1) --a i j = 1.0 / (((i' + j') * (i' + j' + 1) / 2) + i' + 1) -- where i' = fromIntegral i; j' = fromIntegral j a (I# i) (I# j) = D# ( case i +# j of n -> case n *# (n+#1#) of t -> case t `uncheckedIShiftRA#` 1# of u -> case u +# (i +# 1#) of r -> 1.0## /## (int2Double# r)) ----------------------- -- General utility code fill u x = getBounds u >>= \(lb,ub) -> forM_ [lb..ub] (\i -> writeArray u i x) u .| v = getBounds u >>= \(lb,ub) -> go lb ub u v 0 where go lb ub u v acc | lb <= ub = do u' <- readArray u lb v' <- readArray v lb go (lb+1) ub u v (acc + u'*v') | otherwise = return acc -- Explicitly inlining this seems to stand for about a 24x speedup :) {-# INLINE modArray #-} modArray a i f = writeArray a i . f =<< readArray a i -- This is really weird! Replacing iterateM_ x y with forM_ [x..y] in the loops -- above makes the program go twice as slowly. (Really!) {-# INLINE iterateM_ #-} {-# RULES "inline-iterateM_" iterateM_ = \lb ub f -> forM_ [lb..ub] f #-} iterateM_ lb ub = forM_ [lb..ub] data Allocator m i e = forall a. MArray a e m => Allocator ((i, i) -> m (a i e)) allocDouble :: Ix i => Allocator (ST s) i Double allocDouble = Allocator $ (newArray_ :: Ix i => (i,i) -> ST s (STUArray s i Double)) allocDoubleIO :: Ix i => Allocator IO i Double allocDoubleIO = Allocator $ (newArray_ :: Ix i => (i,i) -> IO (IOUArray i Double))
