Benchmarks Game/Parallel/SpectralNorm
From HaskellWiki
Parallel versions of spectral-norm
1 Slightly nicer
{-# LANGUAGE BangPatterns, MagicHash #-} {-# OPTIONS -fvia-C -fexcess-precision #-} -- -- The Computer Language Benchmarks Game -- http://shootout.alioth.debian.org/ -- -- Translation from Clean by Don Stewart -- Modified by Ryan Trinkle: 1) change from divInt# to uncheckedIShiftRA# -- 2) changed -optc-O to -optc-O3 -- 3) added -optc-ffast-math -- Parallelised by Don Stewart -- -- Should be compiled with: -- -O2 -threaded --make -- -optc-O2 -optc-march=native -optc-mfpmath=sse -optc-msse2 -optc-ffast-math -- import System import Foreign.Marshal.Array import Foreign import Text.Printf import Control.Monad import Data.ByteString.Internal import GHC.Base import GHC.Float import GHC.Int import Control.Concurrent import Control.Exception type Reals = Ptr Double main = do n <- getArgs >>= readIO . head u <- mallocArray n :: IO Reals forM_ [0..n-1] $ \i -> pokeElemOff u i 1 v <- mallocArray n :: IO Reals forM_ [0..n-1] $ \i -> pokeElemOff v i 0 powerMethod 10 n u v printf "%.9f\n" (eigenvalue n u v 0 0 0) ------------------------------------------------------------------------ eigenvalue :: Int -> Reals -> Reals -> Int -> Double -> Double -> Double eigenvalue !n !u !v !i !vBv !vv | i < n = eigenvalue n u v (i+1) (vBv + ui * vi) (vv + vi * vi) | otherwise = sqrt $! vBv / vv where ui = inlinePerformIO (peekElemOff u i) vi = inlinePerformIO (peekElemOff v i) ------------------------------------------------------------------------ powerMethod :: Int -> Int -> Reals -> Reals -> IO () powerMethod i n u v = mapM_ takeMVar =<< replicateM i ( do me <- newEmptyMVar; forkIO (child `finally` putMVar me ()); return me ) where child = allocaArray n $ \t -> timesAtAv t n u v >> timesAtAv t n v u {- powerMethod i n u v = do -- roll our own synchronisation children <- newMVar [] replicateM_ i $ do me <- newEmptyMVar cs <- takeMVar children putMVar children (me : cs) forkIO (child `finally` putMVar me ()) wait children where child = allocaArray n $ \t -> timesAtAv t n u v >> timesAtAv t n v u -} {- -- wait on children wait box = do cs <- takeMVar box case cs of [] -> return () m:ms -> putMVar box ms >> takeMVar m >> wait box -} -- multiply vector v by matrix A and then by matrix A transposed timesAtAv :: Reals -> Int -> Reals -> Reals -> IO () timesAtAv !t !n !u !atau = do timesAv n u t timesAtv n t atau timesAv :: Int -> Reals -> Reals -> IO () timesAv !n !u !au = go 0 where go :: Int -> IO () go !i = when (i < n) $ do pokeElemOff au i (avsum i 0 0) go (i+1) avsum :: Int -> Int -> Double -> Double avsum !i !j !acc | j < n = avsum i (j+1) (acc + ((aij i j) * uj)) | otherwise = acc where uj = inlinePerformIO (peekElemOff u j) timesAtv :: Int -> Reals -> Reals -> IO () timesAtv !n !u !a = go 0 where go :: Int -> IO () go !i = when (i < n) $ do pokeElemOff a i (atvsum i 0 0) go (i+1) atvsum :: Int -> Int -> Double -> Double atvsum !i !j !acc | j < n = atvsum i (j+1) (acc + ((aij j i) * uj)) | otherwise = acc where uj = inlinePerformIO (peekElemOff u j) -- -- manually unbox the inner loop: -- aij i j = 1 / fromIntegral ((i+j) * (i+j+1) `div` 2 + i + 1) -- aij (I# i) (I# j) = D# ( case i +# j of n -> case n *# (n+#1#) of t -> case t `uncheckedIShiftRA#` 1# of u -> case u +# (i +# 1#) of r -> 1.0## /## (int2Double# r))
2 Easy parallelisation
Takes the existing single core spectral-norm Haskell entry, and parallelises the main loop using explicit concurrency based on forkIO and MVars.
Should give reasonable cpu utilisation, and overall speedup.
Compiling:
$ ghc C.hs -O2 -optc-O3 -optc-march=pentium4 -optc-mfpmath=sse -optc-msse2 -optc-ffast-math -threaded --make
(That is, pretty much the same flags, but with -threaded) Running:
$ time ./C 5500 +RTS -N5 1.274224153 ./C 5500 +RTS -N5 12.57s user 0.01s system 339% cpu 3.708 total
(Just add +RTS -N5 -RTS) Using 5 capabilities (to hide latency). Compared to current sequential entry,
$ time ./A 5500 1.274224153 ./A 5500 12.56s user 0.01s system 99% cpu 12.586 total
The code:
{-# LANGUAGE BangPatterns, MagicHash #-} {-# OPTIONS -fvia-C -fexcess-precision #-} -- -- The Computer Language Benchmarks Game -- http://shootout.alioth.debian.org/ -- -- Translation from Clean by Don Stewart -- Modified by Ryan Trinkle: 1) change from divInt# to uncheckedIShiftRA# -- 2) changed -optc-O to -optc-O3 -- 3) added -optc-ffast-math -- Parallelised by Don Stewart -- -- Should be compiled with: -- -O2 -threaded --make -- -optc-O2 -optc-march=native -optc-mfpmath=sse -optc-msse2 -optc-ffast-math -- import System import Foreign.Marshal.Array import Foreign import Text.Printf import Control.Monad import Data.ByteString.Internal import GHC.Base import GHC.Float import GHC.Int import Control.Concurrent import Control.Exception type Reals = Ptr Double main = do n <- getArgs >>= readIO . head u <- mallocArray n :: IO Reals forM_ [0..n-1] $ \i -> pokeElemOff u i 1 v <- mallocArray n :: IO Reals forM_ [0..n-1] $ \i -> pokeElemOff v i 0 powerMethod 10 n u v printf "%.9f\n" (eigenvalue n u v 0 0 0) ------------------------------------------------------------------------ eigenvalue :: Int -> Reals -> Reals -> Int -> Double -> Double -> Double eigenvalue !n !u !v !i !vBv !vv | i < n = eigenvalue n u v (i+1) (vBv + ui * vi) (vv + vi * vi) | otherwise = sqrt $! vBv / vv where ui = inlinePerformIO (peekElemOff u i) vi = inlinePerformIO (peekElemOff v i) ------------------------------------------------------------------------ powerMethod :: Int -> Int -> Reals -> Reals -> IO () powerMethod i n u v = do -- roll our own synchronisation children <- newMVar [] replicateM_ i $ do me <- newEmptyMVar cs <- takeMVar children putMVar children (me : cs) forkIO (child `finally` putMVar me ()) wait children where child = allocaArray n $ \t -> timesAtAv t n u v >> timesAtAv t n v u -- wait on children wait box = do cs <- takeMVar box case cs of [] -> return () m:ms -> putMVar box ms >> takeMVar m >> wait box -- multiply vector v by matrix A and then by matrix A transposed timesAtAv :: Reals -> Int -> Reals -> Reals -> IO () timesAtAv !t !n !u !atau = do timesAv n u t timesAtv n t atau timesAv :: Int -> Reals -> Reals -> IO () timesAv !n !u !au = go 0 where go :: Int -> IO () go !i = when (i < n) $ do pokeElemOff au i (avsum i 0 0) go (i+1) avsum :: Int -> Int -> Double -> Double avsum !i !j !acc | j < n = avsum i (j+1) (acc + ((aij i j) * uj)) | otherwise = acc where uj = inlinePerformIO (peekElemOff u j) timesAtv :: Int -> Reals -> Reals -> IO () timesAtv !n !u !a = go 0 where go :: Int -> IO () go !i = when (i < n) $ do pokeElemOff a i (atvsum i 0 0) go (i+1) atvsum :: Int -> Int -> Double -> Double atvsum !i !j !acc | j < n = atvsum i (j+1) (acc + ((aij j i) * uj)) | otherwise = acc where uj = inlinePerformIO (peekElemOff u j) -- -- manually unbox the inner loop: -- aij i j = 1 / fromIntegral ((i+j) * (i+j+1) `div` 2 + i + 1) -- aij (I# i) (I# j) = D# ( case i +# j of n -> case n *# (n+#1#) of t -> case t `uncheckedIShiftRA#` 1# of u -> case u +# (i +# 1#) of r -> 1.0## /## (int2Double# r))
3 ST/IOUArray-based solution
Based on the STUArray solution from Shootout/Spectral, converted to IOUArray for use of forkIO. (For use with ST it would require a forkAndWaitST implementation...)
Note that the benchmark requires synchronization between each iteration. This solution slices the output array into n parts and spawns one child per slice, waiting for every child to complete before continuing. (In theory, it's inefficient to repeatedly fork and synchronize like this, but it doesn't seem to cost that much.)
{-# OPTIONS_GHC -fglasgow-exts #-} -- -- The Computer Language Benchmarks Game -- http://shootout.alioth.debian.org/ -- -- Contributed by Simon Brenner -- -- Compile with: -- -- C-options: -- -fvia-c -optc-O3 -optc-march=native -optc-mfpmath=sse -optc-msse2 -- -optc-ffast-math -- GHC options: -- -O2 -fexcess-precision -threaded import Control.Concurrent import Control.Concurrent.MVar import Control.Exception import Control.Monad.ST import Control.Monad import Data.Array.IO import Data.Array.ST import Data.Ix import Data.Bits import System.Environment import Text.Printf -- Various magic used for the unboxed version of the 'a' function import GHC.Base import GHC.Float main = do n <- fmap (read . head) getArgs --print (spectralNorm n) printf "%.9f" =<< spectralNormM forkAndWaitIO allocDoubleIO n -- STUArray version not parallelised, so we're IO-only for now... --spectralNorm :: Int -> Double --spectralNorm n = runST (spectralNormM forkAndWaitST allocDouble n) -------------------------------------------------------------------------- -- Benchmark code caps = 4 spectralNormM :: (forall x. (x -> m ()) -> [x] -> m ()) -> Allocator m Int Double -> Int -> m Double spectralNormM forkAndWait allocator n = case allocator of (Allocator alloc) -> do let ubound = n-1 u <- alloc (0,ubound) v <- alloc (0,ubound) tmp <- alloc (0,ubound) fill u 1.0 replicateM_ 10 $ do multiplyAtAv forkAndWait 0 ubound u tmp v multiplyAtAv forkAndWait 0 ubound v tmp u vBv <- u .| v vv <- v .| v return $ sqrt (vBv / vv) forkChild child = do m <- newEmptyMVar forkIO $ (child `finally` putMVar m ()) return m waitForChildren = mapM_ takeMVar forkAndWaitIO f xs = waitForChildren =<< mapM (forkChild . f) xs -- For a serialized version that doesn't fork (but works in any monad) --forkAndWait f xs = mapM_ f xs multiplyAtAv forkAndWait lb ub v u atav = do let forkChildren f = forkAndWait f (ranges lb caps ub) forkChildren (multiplyAv v u) forkChildren (multiplyAtv u atav) multiplyAv v av (l,u) = forM_ [l..u] $ \i -> do writeArray av i 0.0 iterateArray v $ \j -> do v' <- readArray v j modArray av i (+ ((a i j) * v')) multiplyAtv v atv (l,u) = forM_ [l..u] $ \i -> do writeArray atv i 0.0 iterateArray v $ \j -> do v' <- readArray v j modArray atv i (+ ((a j i) * v')) a :: Int -> Int -> Double --a i j = 1.0 / fromIntegral (((i + j) * (i + j + 1)) `shiftR` 1 + i + 1) --a i j = 1.0 / ((i' + j') * ((i' + j' + 1) / 2) + i' + 1) -- where i' = fromIntegral i; j' = fromIntegral j a (I# i) (I# j) = D# ( case i +# j of n -> case n *# (n+#1#) of t -> case t `uncheckedIShiftRA#` 1# of u -> case u +# (i +# 1#) of r -> 1.0## /## (int2Double# r)) -------------------------------------------------------------------------- -- General utility code fill u x = getBounds u >>= \(lb,ub) -> forM_ [lb..ub] (\i -> writeArray u i x) u .| v = getBounds u >>= \(lb,ub) -> go lb ub u v 0 where go lb ub u v acc | lb <= ub = do u' <- readArray u lb v' <- readArray v lb go (lb+1) ub u v (acc + u'*v') | otherwise = return acc -- Inline pragma: ~103x speedup :) {-# INLINE modArray #-} modArray a i f = writeArray a i . f =<< readArray a i -- Inline pragma: ~18x speedup {-# INLINE iterateArray #-} iterateArray arr f = getBounds arr >>= \(lb,ub) -> forM_ [lb..ub] f ranges lb n ub = go lb ((ub - lb) `div` n) ub where go x n y | x < y = (x, min (x+n) y) : go (x+n+1) n y go _ _ _ = [] -- Ugly hack to allow polymorphically creating arrays in both ST and IO (required by ST's phantom 's' type) data Allocator m i e = forall a. MArray a e m => Allocator ((i, i) -> m (a i e)) allocDouble :: Ix i => Allocator (ST s) i Double allocDouble = Allocator $ (newArray_ :: Ix i => (i,i) -> ST s (STUArray s i Double)) allocDoubleIO :: Ix i => Allocator IO i Double allocDoubleIO = Allocator $ (newArray_ :: Ix i => (i,i) -> IO (IOUArray i Double))
