Personal tools

Benchmarks Game/Parallel/SpectralNorm

From HaskellWiki

< Benchmarks Game | Parallel(Difference between revisions)
Jump to: navigation, search
Line 1: Line 1:
 
Parallel version of [http://shootout.alioth.debian.org/u64q/benchmark.php?test=spectralnorm&lang=all spectral-norm]
 
Parallel version of [http://shootout.alioth.debian.org/u64q/benchmark.php?test=spectralnorm&lang=all spectral-norm]
   
Easy parallelisation.
+
== Slightly nicer ==
  +
  +
<haskell>
  +
{-# LANGUAGE BangPatterns, MagicHash #-}
  +
{-# OPTIONS -fvia-C -fexcess-precision #-}
  +
--
  +
-- The Computer Language Benchmarks Game
  +
-- http://shootout.alioth.debian.org/
  +
--
  +
-- Translation from Clean by Don Stewart
  +
-- Modified by Ryan Trinkle: 1) change from divInt# to uncheckedIShiftRA#
  +
-- 2) changed -optc-O to -optc-O3
  +
-- 3) added -optc-ffast-math
  +
-- Parallelised by Don Stewart
  +
--
  +
-- Should be compiled with:
  +
-- -O2 -threaded --make
  +
-- -optc-O2 -optc-march=native -optc-mfpmath=sse -optc-msse2 -optc-ffast-math
  +
--
  +
  +
import System
  +
import Foreign.Marshal.Array
  +
import Foreign
  +
import Text.Printf
  +
import Control.Monad
  +
import Data.ByteString.Internal
  +
  +
import GHC.Base
  +
import GHC.Float
  +
import GHC.Int
  +
  +
import Control.Concurrent
  +
import Control.Exception
  +
  +
type Reals = Ptr Double
  +
  +
main = do
  +
n <- getArgs >>= readIO . head
  +
  +
u <- mallocArray n :: IO Reals
  +
forM_ [0..n-1] $ \i -> pokeElemOff u i 1
  +
v <- mallocArray n :: IO Reals
  +
forM_ [0..n-1] $ \i -> pokeElemOff v i 0
  +
  +
powerMethod 10 n u v
  +
printf "%.9f\n" (eigenvalue n u v 0 0 0)
  +
  +
------------------------------------------------------------------------
  +
  +
eigenvalue :: Int -> Reals -> Reals -> Int -> Double -> Double -> Double
  +
eigenvalue !n !u !v !i !vBv !vv
  +
| i < n = eigenvalue n u v (i+1) (vBv + ui * vi) (vv + vi * vi)
  +
| otherwise = sqrt $! vBv / vv
  +
where
  +
ui = inlinePerformIO (peekElemOff u i)
  +
vi = inlinePerformIO (peekElemOff v i)
  +
  +
------------------------------------------------------------------------
  +
  +
powerMethod :: Int -> Int -> Reals -> Reals -> IO ()
  +
powerMethod i n u v =
  +
mapM_ takeMVar =<< replicateM i ( do me <- newEmptyMVar;
  +
forkIO (child `finally` putMVar me ());
  +
return me )
  +
where
  +
child = allocaArray n $ \t -> timesAtAv t n u v >> timesAtAv t n v u
  +
  +
{-
  +
powerMethod i n u v = do
  +
-- roll our own synchronisation
  +
children <- newMVar []
  +
replicateM_ i $ do
  +
me <- newEmptyMVar
  +
cs <- takeMVar children
  +
putMVar children (me : cs)
  +
forkIO (child `finally` putMVar me ())
  +
wait children
  +
where
  +
child = allocaArray n $ \t -> timesAtAv t n u v >> timesAtAv t n v u
  +
-}
  +
  +
{-
  +
-- wait on children
  +
wait box = do
  +
cs <- takeMVar box
  +
case cs of
  +
[] -> return ()
  +
m:ms -> putMVar box ms >> takeMVar m >> wait box
  +
-}
  +
  +
-- multiply vector v by matrix A and then by matrix A transposed
  +
timesAtAv :: Reals -> Int -> Reals -> Reals -> IO ()
  +
timesAtAv !t !n !u !atau = do
  +
timesAv n u t
  +
timesAtv n t atau
  +
  +
timesAv :: Int -> Reals -> Reals -> IO ()
  +
timesAv !n !u !au = go 0
  +
where
  +
go :: Int -> IO ()
  +
go !i = when (i < n) $ do
  +
pokeElemOff au i (avsum i 0 0)
  +
go (i+1)
  +
  +
avsum :: Int -> Int -> Double -> Double
  +
avsum !i !j !acc
  +
| j < n = avsum i (j+1) (acc + ((aij i j) * uj))
  +
| otherwise = acc
  +
where uj = inlinePerformIO (peekElemOff u j)
  +
  +
timesAtv :: Int -> Reals -> Reals -> IO ()
  +
timesAtv !n !u !a = go 0
  +
where
  +
go :: Int -> IO ()
  +
go !i = when (i < n) $ do
  +
pokeElemOff a i (atvsum i 0 0)
  +
go (i+1)
  +
  +
atvsum :: Int -> Int -> Double -> Double
  +
atvsum !i !j !acc
  +
| j < n = atvsum i (j+1) (acc + ((aij j i) * uj))
  +
| otherwise = acc
  +
where uj = inlinePerformIO (peekElemOff u j)
  +
  +
--
  +
-- manually unbox the inner loop:
  +
-- aij i j = 1 / fromIntegral ((i+j) * (i+j+1) `div` 2 + i + 1)
  +
--
  +
aij (I# i) (I# j) = D# (
  +
case i +# j of
  +
n -> case n *# (n+#1#) of
  +
t -> case t `uncheckedIShiftRA#` 1# of
  +
u -> case u +# (i +# 1#) of
  +
r -> 1.0## /## (int2Double# r))
  +
</haskell>
  +
  +
== Easy parallelisation ==
   
 
Takes the existing single core spectral-norm Haskell entry, and
 
Takes the existing single core spectral-norm Haskell entry, and
Line 28: Line 28:
 
./A 5500 12.56s user 0.01s system 99% cpu
 
./A 5500 12.56s user 0.01s system 99% cpu
 
12.586 total
 
12.586 total
  +
  +
The code:
   
 
<haskell>
 
<haskell>

Revision as of 20:33, 20 September 2008

Parallel version of spectral-norm

1 Slightly nicer

{-# LANGUAGE BangPatterns, MagicHash #-}
{-# OPTIONS -fvia-C -fexcess-precision #-}
--
-- The Computer Language Benchmarks Game
-- http://shootout.alioth.debian.org/
--
-- Translation from Clean by Don Stewart
-- Modified by Ryan Trinkle: 1) change from divInt# to uncheckedIShiftRA#
--                           2) changed -optc-O to -optc-O3
--                           3) added -optc-ffast-math
-- Parallelised by Don Stewart
--
-- Should be compiled with:
--      -O2 -threaded --make
--      -optc-O2 -optc-march=native -optc-mfpmath=sse -optc-msse2 -optc-ffast-math
--
 
import System
import Foreign.Marshal.Array
import Foreign
import Text.Printf
import Control.Monad
import Data.ByteString.Internal
 
import GHC.Base
import GHC.Float
import GHC.Int
 
import Control.Concurrent
import Control.Exception
 
type Reals = Ptr Double
 
main = do
    n <- getArgs >>= readIO . head
 
    u <- mallocArray n :: IO Reals
    forM_ [0..n-1] $ \i -> pokeElemOff u i 1
    v <- mallocArray n :: IO Reals
    forM_ [0..n-1] $ \i -> pokeElemOff v i 0
 
    powerMethod 10 n u v
    printf "%.9f\n" (eigenvalue n u v 0 0 0)
 
------------------------------------------------------------------------
 
eigenvalue :: Int -> Reals -> Reals -> Int -> Double -> Double -> Double
eigenvalue !n !u !v !i !vBv !vv
    | i < n     = eigenvalue n u v (i+1) (vBv + ui * vi) (vv + vi * vi)
    | otherwise = sqrt $! vBv / vv
    where
       ui = inlinePerformIO (peekElemOff u i)
       vi = inlinePerformIO (peekElemOff v i)
 
------------------------------------------------------------------------
 
powerMethod :: Int -> Int -> Reals -> Reals -> IO ()
powerMethod i n u v =
    mapM_ takeMVar =<< replicateM i ( do me <- newEmptyMVar;
                                         forkIO (child `finally` putMVar me ());
                                         return me )
   where
    child = allocaArray n $ \t -> timesAtAv t n u v >> timesAtAv t n v u
 
{-
powerMethod i n u v = do
    -- roll our own synchronisation
    children <- newMVar []
    replicateM_ i $ do
        me <- newEmptyMVar
        cs <- takeMVar children
        putMVar children (me : cs)
        forkIO (child `finally` putMVar me ())
    wait children
  where
    child = allocaArray n $ \t -> timesAtAv t n u v >> timesAtAv t n v u
-}
 
{-
    -- wait on children
    wait box = do
      cs <- takeMVar box
      case cs of
        []   -> return ()
        m:ms -> putMVar box ms >> takeMVar m >> wait box
-}  
 
-- multiply vector v by matrix A and then by matrix A transposed
timesAtAv :: Reals -> Int -> Reals -> Reals -> IO ()
timesAtAv !t !n !u !atau = do
    timesAv  n u t
    timesAtv n t atau
 
timesAv :: Int -> Reals -> Reals -> IO ()
timesAv !n !u !au = go 0
  where
    go :: Int -> IO ()
    go !i = when (i < n) $ do
        pokeElemOff au i (avsum i 0 0)
        go (i+1)
 
    avsum :: Int -> Int -> Double -> Double
    avsum !i !j !acc
        | j < n = avsum i (j+1) (acc + ((aij i j) * uj))
        | otherwise = acc
        where uj = inlinePerformIO (peekElemOff u j)
 
timesAtv :: Int -> Reals -> Reals -> IO ()
timesAtv !n !u !a = go 0
  where
    go :: Int -> IO ()
    go !i = when (i < n) $ do
        pokeElemOff a i (atvsum i 0 0)
        go (i+1)
 
    atvsum :: Int -> Int -> Double -> Double
    atvsum !i !j !acc
        | j < n     = atvsum i (j+1) (acc + ((aij j i) * uj))
        | otherwise = acc
        where uj = inlinePerformIO (peekElemOff u j)
 
--
-- manually unbox the inner loop:
-- aij i j = 1 / fromIntegral ((i+j) * (i+j+1) `div` 2 + i + 1)
--
aij (I# i) (I# j) = D# (
    case i +# j of
        n -> case n *# (n+#1#) of
                t -> case t `uncheckedIShiftRA#` 1# of
                        u -> case u +# (i +# 1#) of
                                r -> 1.0## /## (int2Double# r))

2 Easy parallelisation

Takes the existing single core spectral-norm Haskell entry, and parallelises the main loop using explicit concurrency based on forkIO and MVars.

Should give reasonable cpu utilisation, and overall speedup.

Compiling:

   $ ghc C.hs -O2 -optc-O3 -optc-march=pentium4 -optc-mfpmath=sse
   -optc-msse2 -optc-ffast-math -threaded --make

(That is, pretty much the same flags, but with -threaded) Running:

   $ time ./C 5500 +RTS -N5
   1.274224153
   ./C 5500 +RTS -N5  12.57s user 0.01s system 339% cpu 3.708 total

(Just add +RTS -N5 -RTS) Using 5 capabilities (to hide latency). Compared to current sequential entry,

   $ time ./A 5500
   1.274224153
   ./A 5500  12.56s user 0.01s system 99% cpu
   12.586 total

The code:

{-# LANGUAGE BangPatterns, MagicHash #-}
{-# OPTIONS -fvia-C -fexcess-precision #-}
--
-- The Computer Language Benchmarks Game
-- http://shootout.alioth.debian.org/
--
-- Translation from Clean by Don Stewart
-- Modified by Ryan Trinkle: 1) change from divInt# to uncheckedIShiftRA#
--                           2) changed -optc-O to -optc-O3
--                           3) added -optc-ffast-math
-- Parallelised by Don Stewart
--
-- Should be compiled with:
--      -O2 -threaded --make
--      -optc-O2 -optc-march=native -optc-mfpmath=sse -optc-msse2 -optc-ffast-math
--
 
import System
import Foreign.Marshal.Array
import Foreign
import Text.Printf
import Control.Monad
import Data.ByteString.Internal
 
import GHC.Base
import GHC.Float
import GHC.Int
 
import Control.Concurrent
import Control.Exception
 
type Reals = Ptr Double
 
main = do
    n <- getArgs >>= readIO . head
 
    u <- mallocArray n :: IO Reals
    forM_ [0..n-1] $ \i -> pokeElemOff u i 1
    v <- mallocArray n :: IO Reals
    forM_ [0..n-1] $ \i -> pokeElemOff v i 0
 
    powerMethod 10 n u v
    printf "%.9f\n" (eigenvalue n u v 0 0 0)
 
------------------------------------------------------------------------
 
eigenvalue :: Int -> Reals -> Reals -> Int -> Double -> Double -> Double
eigenvalue !n !u !v !i !vBv !vv
    | i < n     = eigenvalue n u v (i+1) (vBv + ui * vi) (vv + vi * vi)
    | otherwise = sqrt $! vBv / vv
    where
       ui = inlinePerformIO (peekElemOff u i)
       vi = inlinePerformIO (peekElemOff v i)
 
------------------------------------------------------------------------
 
powerMethod :: Int -> Int -> Reals -> Reals -> IO ()
powerMethod i n u v = do
    -- roll our own synchronisation
    children <- newMVar []
    replicateM_ i $ do
        me <- newEmptyMVar
        cs <- takeMVar children
        putMVar children (me : cs)
        forkIO (child `finally` putMVar me ())
    wait children
  where
    child = allocaArray n $ \t -> timesAtAv t n u v >> timesAtAv t n v u
 
    -- wait on children
    wait box = do
      cs <- takeMVar box
      case cs of
        []   -> return ()
        m:ms -> putMVar box ms >> takeMVar m >> wait box
 
-- multiply vector v by matrix A and then by matrix A transposed
timesAtAv :: Reals -> Int -> Reals -> Reals -> IO ()
timesAtAv !t !n !u !atau = do
    timesAv  n u t
    timesAtv n t atau
 
timesAv :: Int -> Reals -> Reals -> IO ()
timesAv !n !u !au = go 0
  where
    go :: Int -> IO ()
    go !i = when (i < n) $ do
        pokeElemOff au i (avsum i 0 0)
        go (i+1)
 
    avsum :: Int -> Int -> Double -> Double
    avsum !i !j !acc
        | j < n = avsum i (j+1) (acc + ((aij i j) * uj))
        | otherwise = acc
        where uj = inlinePerformIO (peekElemOff u j)
 
timesAtv :: Int -> Reals -> Reals -> IO ()
timesAtv !n !u !a = go 0
  where
    go :: Int -> IO ()
    go !i = when (i < n) $ do
        pokeElemOff a i (atvsum i 0 0)
        go (i+1)
 
    atvsum :: Int -> Int -> Double -> Double
    atvsum !i !j !acc
        | j < n     = atvsum i (j+1) (acc + ((aij j i) * uj))
        | otherwise = acc
        where uj = inlinePerformIO (peekElemOff u j)
 
--
-- manually unbox the inner loop:
-- aij i j = 1 / fromIntegral ((i+j) * (i+j+1) `div` 2 + i + 1)
--
aij (I# i) (I# j) = D# (
    case i +# j of
        n -> case n *# (n+#1#) of
                t -> case t `uncheckedIShiftRA#` 1# of
                        u -> case u +# (i +# 1#) of
                                r -> 1.0## /## (int2Double# r))