Personal tools

Benchmarks Game/Parallel/SpectralNorm

From HaskellWiki

< Benchmarks Game | Parallel
Revision as of 17:22, 21 September 2008 by Olsner (Talk | contribs)

Jump to: navigation, search

Parallel versions of spectral-norm

1 Slightly nicer

{-# LANGUAGE BangPatterns, MagicHash #-}
{-# OPTIONS -fvia-C -fexcess-precision #-}
--
-- The Computer Language Benchmarks Game
-- http://shootout.alioth.debian.org/
--
-- Translation from Clean by Don Stewart
-- Modified by Ryan Trinkle: 1) change from divInt# to uncheckedIShiftRA#
--                           2) changed -optc-O to -optc-O3
--                           3) added -optc-ffast-math
-- Parallelised by Don Stewart
--
-- Should be compiled with:
--      -O2 -threaded --make
--      -optc-O2 -optc-march=native -optc-mfpmath=sse -optc-msse2 -optc-ffast-math
--
 
import System
import Foreign.Marshal.Array
import Foreign
import Text.Printf
import Control.Monad
import Data.ByteString.Internal
 
import GHC.Base
import GHC.Float
import GHC.Int
 
import Control.Concurrent
import Control.Exception
 
type Reals = Ptr Double
 
main = do
    n <- getArgs >>= readIO . head
 
    u <- mallocArray n :: IO Reals
    forM_ [0..n-1] $ \i -> pokeElemOff u i 1
    v <- mallocArray n :: IO Reals
    forM_ [0..n-1] $ \i -> pokeElemOff v i 0
 
    powerMethod 10 n u v
    printf "%.9f\n" (eigenvalue n u v 0 0 0)
 
------------------------------------------------------------------------
 
eigenvalue :: Int -> Reals -> Reals -> Int -> Double -> Double -> Double
eigenvalue !n !u !v !i !vBv !vv
    | i < n     = eigenvalue n u v (i+1) (vBv + ui * vi) (vv + vi * vi)
    | otherwise = sqrt $! vBv / vv
    where
       ui = inlinePerformIO (peekElemOff u i)
       vi = inlinePerformIO (peekElemOff v i)
 
------------------------------------------------------------------------
 
powerMethod :: Int -> Int -> Reals -> Reals -> IO ()
powerMethod i n u v =
    mapM_ takeMVar =<< replicateM i ( do me <- newEmptyMVar;
                                         forkIO (child `finally` putMVar me ());
                                         return me )
   where
    child = allocaArray n $ \t -> timesAtAv t n u v >> timesAtAv t n v u
 
{-
powerMethod i n u v = do
    -- roll our own synchronisation
    children <- newMVar []
    replicateM_ i $ do
        me <- newEmptyMVar
        cs <- takeMVar children
        putMVar children (me : cs)
        forkIO (child `finally` putMVar me ())
    wait children
  where
    child = allocaArray n $ \t -> timesAtAv t n u v >> timesAtAv t n v u
-}
 
{-
    -- wait on children
    wait box = do
      cs <- takeMVar box
      case cs of
        []   -> return ()
        m:ms -> putMVar box ms >> takeMVar m >> wait box
-}  
 
-- multiply vector v by matrix A and then by matrix A transposed
timesAtAv :: Reals -> Int -> Reals -> Reals -> IO ()
timesAtAv !t !n !u !atau = do
    timesAv  n u t
    timesAtv n t atau
 
timesAv :: Int -> Reals -> Reals -> IO ()
timesAv !n !u !au = go 0
  where
    go :: Int -> IO ()
    go !i = when (i < n) $ do
        pokeElemOff au i (avsum i 0 0)
        go (i+1)
 
    avsum :: Int -> Int -> Double -> Double
    avsum !i !j !acc
        | j < n = avsum i (j+1) (acc + ((aij i j) * uj))
        | otherwise = acc
        where uj = inlinePerformIO (peekElemOff u j)
 
timesAtv :: Int -> Reals -> Reals -> IO ()
timesAtv !n !u !a = go 0
  where
    go :: Int -> IO ()
    go !i = when (i < n) $ do
        pokeElemOff a i (atvsum i 0 0)
        go (i+1)
 
    atvsum :: Int -> Int -> Double -> Double
    atvsum !i !j !acc
        | j < n     = atvsum i (j+1) (acc + ((aij j i) * uj))
        | otherwise = acc
        where uj = inlinePerformIO (peekElemOff u j)
 
--
-- manually unbox the inner loop:
-- aij i j = 1 / fromIntegral ((i+j) * (i+j+1) `div` 2 + i + 1)
--
aij (I# i) (I# j) = D# (
    case i +# j of
        n -> case n *# (n+#1#) of
                t -> case t `uncheckedIShiftRA#` 1# of
                        u -> case u +# (i +# 1#) of
                                r -> 1.0## /## (int2Double# r))

2 Easy parallelisation

Takes the existing single core spectral-norm Haskell entry, and parallelises the main loop using explicit concurrency based on forkIO and MVars.

Should give reasonable cpu utilisation, and overall speedup.

Compiling:

   $ ghc C.hs -O2 -optc-O3 -optc-march=pentium4 -optc-mfpmath=sse
   -optc-msse2 -optc-ffast-math -threaded --make

(That is, pretty much the same flags, but with -threaded) Running:

   $ time ./C 5500 +RTS -N5
   1.274224153
   ./C 5500 +RTS -N5  12.57s user 0.01s system 339% cpu 3.708 total

(Just add +RTS -N5 -RTS) Using 5 capabilities (to hide latency). Compared to current sequential entry,

   $ time ./A 5500
   1.274224153
   ./A 5500  12.56s user 0.01s system 99% cpu
   12.586 total

The code:

{-# LANGUAGE BangPatterns, MagicHash #-}
{-# OPTIONS -fvia-C -fexcess-precision #-}
--
-- The Computer Language Benchmarks Game
-- http://shootout.alioth.debian.org/
--
-- Translation from Clean by Don Stewart
-- Modified by Ryan Trinkle: 1) change from divInt# to uncheckedIShiftRA#
--                           2) changed -optc-O to -optc-O3
--                           3) added -optc-ffast-math
-- Parallelised by Don Stewart
--
-- Should be compiled with:
--      -O2 -threaded --make
--      -optc-O2 -optc-march=native -optc-mfpmath=sse -optc-msse2 -optc-ffast-math
--
 
import System
import Foreign.Marshal.Array
import Foreign
import Text.Printf
import Control.Monad
import Data.ByteString.Internal
 
import GHC.Base
import GHC.Float
import GHC.Int
 
import Control.Concurrent
import Control.Exception
 
type Reals = Ptr Double
 
main = do
    n <- getArgs >>= readIO . head
 
    u <- mallocArray n :: IO Reals
    forM_ [0..n-1] $ \i -> pokeElemOff u i 1
    v <- mallocArray n :: IO Reals
    forM_ [0..n-1] $ \i -> pokeElemOff v i 0
 
    powerMethod 10 n u v
    printf "%.9f\n" (eigenvalue n u v 0 0 0)
 
------------------------------------------------------------------------
 
eigenvalue :: Int -> Reals -> Reals -> Int -> Double -> Double -> Double
eigenvalue !n !u !v !i !vBv !vv
    | i < n     = eigenvalue n u v (i+1) (vBv + ui * vi) (vv + vi * vi)
    | otherwise = sqrt $! vBv / vv
    where
       ui = inlinePerformIO (peekElemOff u i)
       vi = inlinePerformIO (peekElemOff v i)
 
------------------------------------------------------------------------
 
powerMethod :: Int -> Int -> Reals -> Reals -> IO ()
powerMethod i n u v = do
    -- roll our own synchronisation
    children <- newMVar []
    replicateM_ i $ do
        me <- newEmptyMVar
        cs <- takeMVar children
        putMVar children (me : cs)
        forkIO (child `finally` putMVar me ())
    wait children
  where
    child = allocaArray n $ \t -> timesAtAv t n u v >> timesAtAv t n v u
 
    -- wait on children
    wait box = do
      cs <- takeMVar box
      case cs of
        []   -> return ()
        m:ms -> putMVar box ms >> takeMVar m >> wait box
 
-- multiply vector v by matrix A and then by matrix A transposed
timesAtAv :: Reals -> Int -> Reals -> Reals -> IO ()
timesAtAv !t !n !u !atau = do
    timesAv  n u t
    timesAtv n t atau
 
timesAv :: Int -> Reals -> Reals -> IO ()
timesAv !n !u !au = go 0
  where
    go :: Int -> IO ()
    go !i = when (i < n) $ do
        pokeElemOff au i (avsum i 0 0)
        go (i+1)
 
    avsum :: Int -> Int -> Double -> Double
    avsum !i !j !acc
        | j < n = avsum i (j+1) (acc + ((aij i j) * uj))
        | otherwise = acc
        where uj = inlinePerformIO (peekElemOff u j)
 
timesAtv :: Int -> Reals -> Reals -> IO ()
timesAtv !n !u !a = go 0
  where
    go :: Int -> IO ()
    go !i = when (i < n) $ do
        pokeElemOff a i (atvsum i 0 0)
        go (i+1)
 
    atvsum :: Int -> Int -> Double -> Double
    atvsum !i !j !acc
        | j < n     = atvsum i (j+1) (acc + ((aij j i) * uj))
        | otherwise = acc
        where uj = inlinePerformIO (peekElemOff u j)
 
--
-- manually unbox the inner loop:
-- aij i j = 1 / fromIntegral ((i+j) * (i+j+1) `div` 2 + i + 1)
--
aij (I# i) (I# j) = D# (
    case i +# j of
        n -> case n *# (n+#1#) of
                t -> case t `uncheckedIShiftRA#` 1# of
                        u -> case u +# (i +# 1#) of
                                r -> 1.0## /## (int2Double# r))

3 Actually correct entry

Based on the STUArray solution from Shootout/Spectral, converted to IOUArray for use of forkIO. (Only needs a forkAndWaitST implementation...)

Note that the benchmark requires synchronization between each iteration. This solution slices the output array into n parts and spawns one child per slice, waiting for every child to complete before continuing. (In theory, it's inefficient to repeatedly fork and synchronize like this, but it doesn't seem to cost that much.)

{-# OPTIONS_GHC -fglasgow-exts #-}
 
--
-- The Computer Language Benchmarks Game
-- http://shootout.alioth.debian.org/
--
-- Contributed by Simon Brenner
--
-- Compile with:
--
-- C-options:
--    -fvia-c -optc-O3 -optc-march=native -optc-mfpmath=sse -optc-msse2
--    -optc-ffast-math
-- GHC options:
--    -O2 -fexcess-precision -threaded
 
import Control.Concurrent
import Control.Concurrent.MVar
import Control.Exception
 
import Control.Monad.ST
import Control.Monad
 
import Data.Array.IO
import Data.Array.ST
import Data.Ix
import Data.Bits
 
import System.Environment
import Text.Printf
 
-- Various magic used for the unboxed version of the 'a' function
import GHC.Base
import GHC.Float
 
main = do
    n <- fmap (read . head) getArgs
    -- Use this variant for STUArray rather than IOUArray
    --print (spectralNorm n)
    printf "%.9f" =<< spectralNormM forkAndWaitIO allocDoubleIO n
 
-- STUArray version not parallelised, so we're IO-only for now...
--spectralNorm :: Int -> Double
--spectralNorm n = runST (spectralNormM forkAndWaitST allocDouble n)
 
--------------------------------------------------------------------------
-- Benchmark code
 
caps = 4
 
spectralNormM :: (forall x. (x -> m ()) -> [x] -> m ()) -> Allocator m Int Double -> Int -> m Double
spectralNormM forkAndWait allocator n = case allocator of
  (Allocator alloc) -> do
    let ubound = n-1
    u <- alloc (0,ubound)
    v <- alloc (0,ubound)
    tmp <- alloc (0,ubound)
 
    fill u 1.0
 
    replicateM_ 10 $ do
        multiplyAtAv forkAndWait 0 ubound u tmp v
        multiplyAtAv forkAndWait 0 ubound v tmp u
 
    vBv <- u .| v
    vv <- v .| v
 
    return $ sqrt (vBv / vv)
 
forkChild child = do
    m <- newEmptyMVar
    forkIO $ (child `finally` putMVar m ())
    return m
 
waitForChildren = mapM_ takeMVar
 
forkAndWaitIO f xs = waitForChildren =<< mapM (forkChild . f) xs
-- For a serialized version that doesn't fork (but works in any monad)
--forkAndWait f xs = mapM_ f xs
 
multiplyAtAv forkAndWait lb ub v u atav = do
    let forkChildren f = forkAndWait f (ranges lb caps ub)
    forkChildren (multiplyAv v u)
    forkChildren (multiplyAtv u atav)
 
multiplyAv v av (l,u) = forM_ [l..u] $ \i -> do
    writeArray av i 0.0
    iterateArray v $ \j -> do
        v' <- readArray v j
        modArray av i (+ ((a i j) * v'))
 
multiplyAtv v atv (l,u) = forM_ [l..u] $ \i -> do
    writeArray atv i 0.0
    iterateArray v $ \j -> do
        v' <- readArray v j
        modArray atv i (+ ((a j i) * v'))
 
a :: Int -> Int -> Double
--a i j = 1.0 / fromIntegral (((i + j) * (i + j + 1)) `shiftR` 1 + i + 1)
--a i j = 1.0 / ((i' + j') * ((i' + j' + 1) / 2) + i' + 1)
--  where i' = fromIntegral i; j' = fromIntegral j
 
a (I# i) (I# j) = D# (
    case i +# j of
        n -> case n *# (n+#1#) of
            t -> case t `uncheckedIShiftRA#` 1# of
                u -> case u +# (i +# 1#) of
                    r -> 1.0## /## (int2Double# r))
 
--------------------------------------------------------------------------
-- General utility code
 
fill u x = getBounds u >>= \(lb,ub) -> forM_ [lb..ub] (\i -> writeArray u i x)
 
u .| v = getBounds u >>= \(lb,ub) -> go lb ub u v 0
    where
        go lb ub u v acc
            | lb <= ub = do
                u' <- readArray u lb
                v' <- readArray v lb
                go (lb+1) ub u v (acc + u'*v')
            | otherwise = return acc
 
-- Inline pragma: ~103x speedup :)
{-# INLINE modArray #-}
modArray a i f = writeArray a i . f =<< readArray a i
 
-- Inline pragma: ~18x speedup
{-# INLINE iterateArray #-}
iterateArray arr f = getBounds arr >>= \(lb,ub) -> forM_ [lb..ub] f
 
ranges lb n ub = go lb ((ub - lb) `div` n) ub
    where
        go x n y | x < y = (x, min (x+n) y) : go (x+n+1) n y
        go _ _ _ = []
 
data Allocator m i e = forall a. MArray a e m => 
                       Allocator ((i, i) -> m (a i e))
 
allocDouble :: Ix i => Allocator (ST s) i Double
allocDouble = Allocator $
    (newArray_ :: Ix i => (i,i) -> ST s (STUArray s i Double))
 
allocDoubleIO :: Ix i => Allocator IO i Double
allocDoubleIO = Allocator $
    (newArray_ :: Ix i => (i,i) -> IO (IOUArray i Double))