# Benchmarks Game/Parallel/SpectralNorm

(Difference between revisions)

Parallel versions of spectral-norm

## 1 Slightly nicer

```{-# LANGUAGE BangPatterns, MagicHash #-}
{-# OPTIONS -fvia-C -fexcess-precision #-}
--
-- The Computer Language Benchmarks Game
-- http://shootout.alioth.debian.org/
--
-- Translation from Clean by Don Stewart
-- Modified by Ryan Trinkle: 1) change from divInt# to uncheckedIShiftRA#
--                           2) changed -optc-O to -optc-O3
-- Parallelised by Don Stewart
--
-- Should be compiled with:
--      -optc-O2 -optc-march=native -optc-mfpmath=sse -optc-msse2 -optc-ffast-math
--

import System
import Foreign.Marshal.Array
import Foreign
import Text.Printf
import Data.ByteString.Internal

import GHC.Base
import GHC.Float
import GHC.Int

import Control.Concurrent
import Control.Exception

type Reals = Ptr Double

main = do

u <- mallocArray n :: IO Reals
forM_ [0..n-1] \$ \i -> pokeElemOff u i 1
v <- mallocArray n :: IO Reals
forM_ [0..n-1] \$ \i -> pokeElemOff v i 0

powerMethod 10 n u v
printf "%.9f\n" (eigenvalue n u v 0 0 0)

------------------------------------------------------------------------

eigenvalue :: Int -> Reals -> Reals -> Int -> Double -> Double -> Double
eigenvalue !n !u !v !i !vBv !vv
| i < n     = eigenvalue n u v (i+1) (vBv + ui * vi) (vv + vi * vi)
| otherwise = sqrt \$! vBv / vv
where
ui = inlinePerformIO (peekElemOff u i)
vi = inlinePerformIO (peekElemOff v i)

------------------------------------------------------------------------

powerMethod :: Int -> Int -> Reals -> Reals -> IO ()
powerMethod i n u v =
mapM_ takeMVar =<< replicateM i ( do me <- newEmptyMVar;
forkIO (child `finally` putMVar me ());
return me )
where
child = allocaArray n \$ \t -> timesAtAv t n u v >> timesAtAv t n v u

{-
powerMethod i n u v = do
-- roll our own synchronisation
children <- newMVar []
replicateM_ i \$ do
me <- newEmptyMVar
cs <- takeMVar children
putMVar children (me : cs)
forkIO (child `finally` putMVar me ())
wait children
where
child = allocaArray n \$ \t -> timesAtAv t n u v >> timesAtAv t n v u
-}

{-
-- wait on children
wait box = do
cs <- takeMVar box
case cs of
[]   -> return ()
m:ms -> putMVar box ms >> takeMVar m >> wait box
-}

-- multiply vector v by matrix A and then by matrix A transposed
timesAtAv :: Reals -> Int -> Reals -> Reals -> IO ()
timesAtAv !t !n !u !atau = do
timesAv  n u t
timesAtv n t atau

timesAv :: Int -> Reals -> Reals -> IO ()
timesAv !n !u !au = go 0
where
go :: Int -> IO ()
go !i = when (i < n) \$ do
pokeElemOff au i (avsum i 0 0)
go (i+1)

avsum :: Int -> Int -> Double -> Double
avsum !i !j !acc
| j < n = avsum i (j+1) (acc + ((aij i j) * uj))
| otherwise = acc
where uj = inlinePerformIO (peekElemOff u j)

timesAtv :: Int -> Reals -> Reals -> IO ()
timesAtv !n !u !a = go 0
where
go :: Int -> IO ()
go !i = when (i < n) \$ do
pokeElemOff a i (atvsum i 0 0)
go (i+1)

atvsum :: Int -> Int -> Double -> Double
atvsum !i !j !acc
| j < n     = atvsum i (j+1) (acc + ((aij j i) * uj))
| otherwise = acc
where uj = inlinePerformIO (peekElemOff u j)

--
-- manually unbox the inner loop:
-- aij i j = 1 / fromIntegral ((i+j) * (i+j+1) `div` 2 + i + 1)
--
aij (I# i) (I# j) = D# (
case i +# j of
n -> case n *# (n+#1#) of
t -> case t `uncheckedIShiftRA#` 1# of
u -> case u +# (i +# 1#) of
r -> 1.0## /## (int2Double# r))```

## 2 Easy parallelisation

Takes the existing single core spectral-norm Haskell entry, and parallelises the main loop using explicit concurrency based on forkIO and MVars.

Should give reasonable cpu utilisation, and overall speedup.

Compiling:

```   \$ ghc C.hs -O2 -optc-O3 -optc-march=pentium4 -optc-mfpmath=sse
```

(That is, pretty much the same flags, but with -threaded) Running:

```   \$ time ./C 5500 +RTS -N5
1.274224153
./C 5500 +RTS -N5  12.57s user 0.01s system 339% cpu 3.708 total
```

(Just add +RTS -N5 -RTS) Using 5 capabilities (to hide latency). Compared to current sequential entry,

```   \$ time ./A 5500
1.274224153
./A 5500  12.56s user 0.01s system 99% cpu
12.586 total
```

The code:

```{-# LANGUAGE BangPatterns, MagicHash #-}
{-# OPTIONS -fvia-C -fexcess-precision #-}
--
-- The Computer Language Benchmarks Game
-- http://shootout.alioth.debian.org/
--
-- Translation from Clean by Don Stewart
-- Modified by Ryan Trinkle: 1) change from divInt# to uncheckedIShiftRA#
--                           2) changed -optc-O to -optc-O3
-- Parallelised by Don Stewart
--
-- Should be compiled with:
--      -optc-O2 -optc-march=native -optc-mfpmath=sse -optc-msse2 -optc-ffast-math
--

import System
import Foreign.Marshal.Array
import Foreign
import Text.Printf
import Data.ByteString.Internal

import GHC.Base
import GHC.Float
import GHC.Int

import Control.Concurrent
import Control.Exception

type Reals = Ptr Double

main = do

u <- mallocArray n :: IO Reals
forM_ [0..n-1] \$ \i -> pokeElemOff u i 1
v <- mallocArray n :: IO Reals
forM_ [0..n-1] \$ \i -> pokeElemOff v i 0

powerMethod 10 n u v
printf "%.9f\n" (eigenvalue n u v 0 0 0)

------------------------------------------------------------------------

eigenvalue :: Int -> Reals -> Reals -> Int -> Double -> Double -> Double
eigenvalue !n !u !v !i !vBv !vv
| i < n     = eigenvalue n u v (i+1) (vBv + ui * vi) (vv + vi * vi)
| otherwise = sqrt \$! vBv / vv
where
ui = inlinePerformIO (peekElemOff u i)
vi = inlinePerformIO (peekElemOff v i)

------------------------------------------------------------------------

powerMethod :: Int -> Int -> Reals -> Reals -> IO ()
powerMethod i n u v = do
-- roll our own synchronisation
children <- newMVar []
replicateM_ i \$ do
me <- newEmptyMVar
cs <- takeMVar children
putMVar children (me : cs)
forkIO (child `finally` putMVar me ())
wait children
where
child = allocaArray n \$ \t -> timesAtAv t n u v >> timesAtAv t n v u

-- wait on children
wait box = do
cs <- takeMVar box
case cs of
[]   -> return ()
m:ms -> putMVar box ms >> takeMVar m >> wait box

-- multiply vector v by matrix A and then by matrix A transposed
timesAtAv :: Reals -> Int -> Reals -> Reals -> IO ()
timesAtAv !t !n !u !atau = do
timesAv  n u t
timesAtv n t atau

timesAv :: Int -> Reals -> Reals -> IO ()
timesAv !n !u !au = go 0
where
go :: Int -> IO ()
go !i = when (i < n) \$ do
pokeElemOff au i (avsum i 0 0)
go (i+1)

avsum :: Int -> Int -> Double -> Double
avsum !i !j !acc
| j < n = avsum i (j+1) (acc + ((aij i j) * uj))
| otherwise = acc
where uj = inlinePerformIO (peekElemOff u j)

timesAtv :: Int -> Reals -> Reals -> IO ()
timesAtv !n !u !a = go 0
where
go :: Int -> IO ()
go !i = when (i < n) \$ do
pokeElemOff a i (atvsum i 0 0)
go (i+1)

atvsum :: Int -> Int -> Double -> Double
atvsum !i !j !acc
| j < n     = atvsum i (j+1) (acc + ((aij j i) * uj))
| otherwise = acc
where uj = inlinePerformIO (peekElemOff u j)

--
-- manually unbox the inner loop:
-- aij i j = 1 / fromIntegral ((i+j) * (i+j+1) `div` 2 + i + 1)
--
aij (I# i) (I# j) = D# (
case i +# j of
n -> case n *# (n+#1#) of
t -> case t `uncheckedIShiftRA#` 1# of
u -> case u +# (i +# 1#) of
r -> 1.0## /## (int2Double# r))```

## 3 ST/IOUArray-based solution

Based on the STUArray solution from Shootout/Spectral, converted to IOUArray for use of forkIO. (For use with ST it would require a forkAndWaitST implementation...)

Note that the benchmark requires synchronization between each iteration. This solution slices the output array into n parts and spawns one child per slice, waiting for every child to complete before continuing. (In theory, it's inefficient to repeatedly fork and synchronize like this, but it doesn't seem to cost that much.)

```{-# OPTIONS_GHC -fglasgow-exts #-}

--
-- The Computer Language Benchmarks Game
-- http://shootout.alioth.debian.org/
--
-- Contributed by Simon Brenner
--
-- Compile with:
--
-- C-options:
--    -fvia-c -optc-O3 -optc-march=native -optc-mfpmath=sse -optc-msse2
--    -optc-ffast-math
-- GHC options:

import Control.Concurrent
import Control.Concurrent.MVar
import Control.Exception

import Data.Array.IO
import Data.Array.ST
import Data.Ix
import Data.Bits

import System.Environment
import Text.Printf

-- Various magic used for the unboxed version of the 'a' function
import GHC.Base
import GHC.Float

main = do
--print (spectralNorm n)
printf "%.9f" =<< spectralNormM forkAndWaitIO allocDoubleIO n

-- STUArray version not parallelised, so we're IO-only for now...
--spectralNorm :: Int -> Double
--spectralNorm n = runST (spectralNormM forkAndWaitST allocDouble n)

--------------------------------------------------------------------------
-- Benchmark code

caps = 4

spectralNormM :: (forall x. (x -> m ()) -> [x] -> m ()) -> Allocator m Int Double -> Int -> m Double
spectralNormM forkAndWait allocator n = case allocator of
(Allocator alloc) -> do
let ubound = n-1
u <- alloc (0,ubound)
v <- alloc (0,ubound)
tmp <- alloc (0,ubound)

fill u 1.0

replicateM_ 10 \$ do
multiplyAtAv forkAndWait 0 ubound u tmp v
multiplyAtAv forkAndWait 0 ubound v tmp u

vBv <- u .| v
vv <- v .| v

return \$ sqrt (vBv / vv)

forkChild child = do
m <- newEmptyMVar
forkIO \$ (child `finally` putMVar m ())
return m

waitForChildren = mapM_ takeMVar

forkAndWaitIO f xs = waitForChildren =<< mapM (forkChild . f) xs
-- For a serialized version that doesn't fork (but works in any monad)
--forkAndWait f xs = mapM_ f xs

multiplyAtAv forkAndWait lb ub v u atav = do
let forkChildren f = forkAndWait f (ranges lb caps ub)
forkChildren (multiplyAv v u)
forkChildren (multiplyAtv u atav)

multiplyAv v av (l,u) = forM_ [l..u] \$ \i -> do
writeArray av i 0.0
iterateArray v \$ \j -> do
modArray av i (+ ((a i j) * v'))

multiplyAtv v atv (l,u) = forM_ [l..u] \$ \i -> do
writeArray atv i 0.0
iterateArray v \$ \j -> do
modArray atv i (+ ((a j i) * v'))

a :: Int -> Int -> Double
--a i j = 1.0 / fromIntegral (((i + j) * (i + j + 1)) `shiftR` 1 + i + 1)
--a i j = 1.0 / ((i' + j') * ((i' + j' + 1) / 2) + i' + 1)
--  where i' = fromIntegral i; j' = fromIntegral j

a (I# i) (I# j) = D# (
case i +# j of
n -> case n *# (n+#1#) of
t -> case t `uncheckedIShiftRA#` 1# of
u -> case u +# (i +# 1#) of
r -> 1.0## /## (int2Double# r))

--------------------------------------------------------------------------
-- General utility code

fill u x = getBounds u >>= \(lb,ub) -> forM_ [lb..ub] (\i -> writeArray u i x)

u .| v = getBounds u >>= \(lb,ub) -> go lb ub u v 0
where
go lb ub u v acc
| lb <= ub = do
go (lb+1) ub u v (acc + u'*v')
| otherwise = return acc

-- Inline pragma: ~103x speedup :)
{-# INLINE modArray #-}
modArray a i f = writeArray a i . f =<< readArray a i

-- Inline pragma: ~18x speedup
{-# INLINE iterateArray #-}
iterateArray arr f = getBounds arr >>= \(lb,ub) -> forM_ [lb..ub] f

ranges lb n ub = go lb ((ub - lb) `div` n) ub
where
go x n y | x < y = (x, min (x+n) y) : go (x+n+1) n y
go _ _ _ = []

-- Ugly hack to allow polymorphically creating arrays in both ST and IO (required by ST's phantom 's' type)
data Allocator m i e = forall a. MArray a e m =>
Allocator ((i, i) -> m (a i e))

allocDouble :: Ix i => Allocator (ST s) i Double
allocDouble = Allocator \$
(newArray_ :: Ix i => (i,i) -> ST s (STUArray s i Double))

allocDoubleIO :: Ix i => Allocator IO i Double
allocDoubleIO = Allocator \$
(newArray_ :: Ix i => (i,i) -> IO (IOUArray i Double))```