Difference between revisions of "Benchmarks Game/Parallel/SpectralNorm"

From HaskellWiki
Jump to navigation Jump to search
m (Shootout/Parallel/SpectralNorm moved to Benchmarks Game/Parallel/SpectralNorm: The name of the benchmarks site has changed)
 
(3 intermediate revisions by 2 users not shown)
Line 1: Line 1:
Parallel version of [http://shootout.alioth.debian.org/u64q/benchmark.php?test=spectralnorm&lang=all spectral-norm]
+
Parallel versions of [http://shootout.alioth.debian.org/u64q/benchmark.php?test=spectralnorm&lang=all spectral-norm]
   
  +
== Slightly nicer ==
Easy parallelisation.
 
  +
  +
<haskell>
  +
{-# LANGUAGE BangPatterns, MagicHash #-}
  +
{-# OPTIONS -fvia-C -fexcess-precision #-}
  +
--
  +
-- The Computer Language Benchmarks Game
  +
-- http://shootout.alioth.debian.org/
  +
--
  +
-- Translation from Clean by Don Stewart
  +
-- Modified by Ryan Trinkle: 1) change from divInt# to uncheckedIShiftRA#
  +
-- 2) changed -optc-O to -optc-O3
  +
-- 3) added -optc-ffast-math
  +
-- Parallelised by Don Stewart
  +
--
  +
-- Should be compiled with:
  +
-- -O2 -threaded --make
  +
-- -optc-O2 -optc-march=native -optc-mfpmath=sse -optc-msse2 -optc-ffast-math
  +
--
  +
  +
import System
  +
import Foreign.Marshal.Array
  +
import Foreign
  +
import Text.Printf
  +
import Control.Monad
  +
import Data.ByteString.Internal
  +
  +
import GHC.Base
  +
import GHC.Float
  +
import GHC.Int
  +
  +
import Control.Concurrent
  +
import Control.Exception
  +
  +
type Reals = Ptr Double
  +
  +
main = do
  +
n <- getArgs >>= readIO . head
  +
  +
u <- mallocArray n :: IO Reals
  +
forM_ [0..n-1] $ \i -> pokeElemOff u i 1
  +
v <- mallocArray n :: IO Reals
  +
forM_ [0..n-1] $ \i -> pokeElemOff v i 0
  +
  +
powerMethod 10 n u v
  +
printf "%.9f\n" (eigenvalue n u v 0 0 0)
  +
  +
------------------------------------------------------------------------
  +
  +
eigenvalue :: Int -> Reals -> Reals -> Int -> Double -> Double -> Double
  +
eigenvalue !n !u !v !i !vBv !vv
  +
| i < n = eigenvalue n u v (i+1) (vBv + ui * vi) (vv + vi * vi)
  +
| otherwise = sqrt $! vBv / vv
  +
where
  +
ui = inlinePerformIO (peekElemOff u i)
  +
vi = inlinePerformIO (peekElemOff v i)
  +
  +
------------------------------------------------------------------------
  +
  +
powerMethod :: Int -> Int -> Reals -> Reals -> IO ()
  +
powerMethod i n u v =
  +
mapM_ takeMVar =<< replicateM i ( do me <- newEmptyMVar;
  +
forkIO (child `finally` putMVar me ());
  +
return me )
  +
where
  +
child = allocaArray n $ \t -> timesAtAv t n u v >> timesAtAv t n v u
  +
  +
{-
  +
powerMethod i n u v = do
  +
-- roll our own synchronisation
  +
children <- newMVar []
  +
replicateM_ i $ do
  +
me <- newEmptyMVar
  +
cs <- takeMVar children
  +
putMVar children (me : cs)
  +
forkIO (child `finally` putMVar me ())
  +
wait children
  +
where
  +
child = allocaArray n $ \t -> timesAtAv t n u v >> timesAtAv t n v u
  +
-}
  +
  +
{-
  +
-- wait on children
  +
wait box = do
  +
cs <- takeMVar box
  +
case cs of
  +
[] -> return ()
  +
m:ms -> putMVar box ms >> takeMVar m >> wait box
  +
-}
  +
  +
-- multiply vector v by matrix A and then by matrix A transposed
  +
timesAtAv :: Reals -> Int -> Reals -> Reals -> IO ()
  +
timesAtAv !t !n !u !atau = do
  +
timesAv n u t
  +
timesAtv n t atau
  +
  +
timesAv :: Int -> Reals -> Reals -> IO ()
  +
timesAv !n !u !au = go 0
  +
where
  +
go :: Int -> IO ()
  +
go !i = when (i < n) $ do
  +
pokeElemOff au i (avsum i 0 0)
  +
go (i+1)
  +
  +
avsum :: Int -> Int -> Double -> Double
  +
avsum !i !j !acc
  +
| j < n = avsum i (j+1) (acc + ((aij i j) * uj))
  +
| otherwise = acc
  +
where uj = inlinePerformIO (peekElemOff u j)
  +
  +
timesAtv :: Int -> Reals -> Reals -> IO ()
  +
timesAtv !n !u !a = go 0
  +
where
  +
go :: Int -> IO ()
  +
go !i = when (i < n) $ do
  +
pokeElemOff a i (atvsum i 0 0)
  +
go (i+1)
  +
  +
atvsum :: Int -> Int -> Double -> Double
  +
atvsum !i !j !acc
  +
| j < n = atvsum i (j+1) (acc + ((aij j i) * uj))
  +
| otherwise = acc
  +
where uj = inlinePerformIO (peekElemOff u j)
  +
  +
--
  +
-- manually unbox the inner loop:
  +
-- aij i j = 1 / fromIntegral ((i+j) * (i+j+1) `div` 2 + i + 1)
  +
--
  +
aij (I# i) (I# j) = D# (
  +
case i +# j of
  +
n -> case n *# (n+#1#) of
  +
t -> case t `uncheckedIShiftRA#` 1# of
  +
u -> case u +# (i +# 1#) of
  +
r -> 1.0## /## (int2Double# r))
  +
</haskell>
  +
  +
== Easy parallelisation ==
   
 
Takes the existing single core spectral-norm Haskell entry, and
 
Takes the existing single core spectral-norm Haskell entry, and
Line 28: Line 164:
 
./A 5500 12.56s user 0.01s system 99% cpu
 
./A 5500 12.56s user 0.01s system 99% cpu
 
12.586 total
 
12.586 total
  +
  +
The code:
   
 
<haskell>
 
<haskell>
Line 150: Line 288:
 
u -> case u +# (i +# 1#) of
 
u -> case u +# (i +# 1#) of
 
r -> 1.0## /## (int2Double# r))
 
r -> 1.0## /## (int2Double# r))
  +
</haskell>
  +
  +
== ST/IOUArray-based solution ==
  +
  +
Based on the STUArray solution from [[Shootout/Spectral]], converted to IOUArray for use of forkIO. (For use with ST it would require a forkAndWaitST implementation...)
  +
  +
Note that the benchmark requires synchronization between each iteration. This solution slices the output array into n parts and spawns one child per slice, waiting for every child to complete before continuing. (In theory, it's inefficient to repeatedly fork and synchronize like this, but it doesn't seem to cost that much.)
  +
  +
<haskell>
  +
{-# OPTIONS_GHC -fglasgow-exts #-}
  +
  +
--
  +
-- The Computer Language Benchmarks Game
  +
-- http://shootout.alioth.debian.org/
  +
--
  +
-- Contributed by Simon Brenner
  +
--
  +
-- Compile with:
  +
--
  +
-- C-options:
  +
-- -fvia-c -optc-O3 -optc-march=native -optc-mfpmath=sse -optc-msse2
  +
-- -optc-ffast-math
  +
-- GHC options:
  +
-- -O2 -fexcess-precision -threaded
  +
  +
import Control.Concurrent
  +
import Control.Concurrent.MVar
  +
import Control.Exception
  +
  +
import Control.Monad.ST
  +
import Control.Monad
  +
  +
import Data.Array.IO
  +
import Data.Array.ST
  +
import Data.Ix
  +
import Data.Bits
  +
  +
import System.Environment
  +
import Text.Printf
  +
  +
-- Various magic used for the unboxed version of the 'a' function
  +
import GHC.Base
  +
import GHC.Float
  +
  +
main = do
  +
n <- fmap (read . head) getArgs
  +
--print (spectralNorm n)
  +
printf "%.9f" =<< spectralNormM forkAndWaitIO allocDoubleIO n
  +
  +
-- STUArray version not parallelised, so we're IO-only for now...
  +
--spectralNorm :: Int -> Double
  +
--spectralNorm n = runST (spectralNormM forkAndWaitST allocDouble n)
  +
  +
--------------------------------------------------------------------------
  +
-- Benchmark code
  +
  +
caps = 4
  +
  +
spectralNormM :: (forall x. (x -> m ()) -> [x] -> m ()) -> Allocator m Int Double -> Int -> m Double
  +
spectralNormM forkAndWait allocator n = case allocator of
  +
(Allocator alloc) -> do
  +
let ubound = n-1
  +
u <- alloc (0,ubound)
  +
v <- alloc (0,ubound)
  +
tmp <- alloc (0,ubound)
  +
  +
fill u 1.0
  +
  +
replicateM_ 10 $ do
  +
multiplyAtAv forkAndWait 0 ubound u tmp v
  +
multiplyAtAv forkAndWait 0 ubound v tmp u
  +
  +
vBv <- u .| v
  +
vv <- v .| v
  +
  +
return $ sqrt (vBv / vv)
  +
  +
forkChild child = do
  +
m <- newEmptyMVar
  +
forkIO $ (child `finally` putMVar m ())
  +
return m
  +
  +
waitForChildren = mapM_ takeMVar
  +
  +
forkAndWaitIO f xs = waitForChildren =<< mapM (forkChild . f) xs
  +
-- For a serialized version that doesn't fork (but works in any monad)
  +
--forkAndWait f xs = mapM_ f xs
  +
  +
multiplyAtAv forkAndWait lb ub v u atav = do
  +
let forkChildren f = forkAndWait f (ranges lb caps ub)
  +
forkChildren (multiplyAv v u)
  +
forkChildren (multiplyAtv u atav)
  +
  +
multiplyAv v av (l,u) = forM_ [l..u] $ \i -> do
  +
writeArray av i 0.0
  +
iterateArray v $ \j -> do
  +
v' <- readArray v j
  +
modArray av i (+ ((a i j) * v'))
  +
  +
multiplyAtv v atv (l,u) = forM_ [l..u] $ \i -> do
  +
writeArray atv i 0.0
  +
iterateArray v $ \j -> do
  +
v' <- readArray v j
  +
modArray atv i (+ ((a j i) * v'))
  +
  +
a :: Int -> Int -> Double
  +
--a i j = 1.0 / fromIntegral (((i + j) * (i + j + 1)) `shiftR` 1 + i + 1)
  +
--a i j = 1.0 / ((i' + j') * ((i' + j' + 1) / 2) + i' + 1)
  +
-- where i' = fromIntegral i; j' = fromIntegral j
  +
  +
a (I# i) (I# j) = D# (
  +
case i +# j of
  +
n -> case n *# (n+#1#) of
  +
t -> case t `uncheckedIShiftRA#` 1# of
  +
u -> case u +# (i +# 1#) of
  +
r -> 1.0## /## (int2Double# r))
  +
  +
--------------------------------------------------------------------------
  +
-- General utility code
  +
  +
fill u x = getBounds u >>= \(lb,ub) -> forM_ [lb..ub] (\i -> writeArray u i x)
  +
  +
u .| v = getBounds u >>= \(lb,ub) -> go lb ub u v 0
  +
where
  +
go lb ub u v acc
  +
| lb <= ub = do
  +
u' <- readArray u lb
  +
v' <- readArray v lb
  +
go (lb+1) ub u v (acc + u'*v')
  +
| otherwise = return acc
  +
  +
-- Inline pragma: ~103x speedup :)
  +
{-# INLINE modArray #-}
  +
modArray a i f = writeArray a i . f =<< readArray a i
  +
  +
-- Inline pragma: ~18x speedup
  +
{-# INLINE iterateArray #-}
  +
iterateArray arr f = getBounds arr >>= \(lb,ub) -> forM_ [lb..ub] f
  +
  +
ranges lb n ub = go lb ((ub - lb) `div` n) ub
  +
where
  +
go x n y | x < y = (x, min (x+n) y) : go (x+n+1) n y
  +
go _ _ _ = []
  +
  +
-- Ugly hack to allow polymorphically creating arrays in both ST and IO (required by ST's phantom 's' type)
  +
data Allocator m i e = forall a. MArray a e m =>
  +
Allocator ((i, i) -> m (a i e))
  +
  +
allocDouble :: Ix i => Allocator (ST s) i Double
  +
allocDouble = Allocator $
  +
(newArray_ :: Ix i => (i,i) -> ST s (STUArray s i Double))
  +
  +
allocDoubleIO :: Ix i => Allocator IO i Double
  +
allocDoubleIO = Allocator $
  +
(newArray_ :: Ix i => (i,i) -> IO (IOUArray i Double))
 
</haskell>
 
</haskell>

Latest revision as of 22:33, 22 January 2012

Parallel versions of spectral-norm

Slightly nicer

{-# LANGUAGE BangPatterns, MagicHash #-}
{-# OPTIONS -fvia-C -fexcess-precision #-}
--
-- The Computer Language Benchmarks Game
-- http://shootout.alioth.debian.org/
--
-- Translation from Clean by Don Stewart
-- Modified by Ryan Trinkle: 1) change from divInt# to uncheckedIShiftRA#
--                           2) changed -optc-O to -optc-O3
--                           3) added -optc-ffast-math
-- Parallelised by Don Stewart
--
-- Should be compiled with:
--      -O2 -threaded --make
--      -optc-O2 -optc-march=native -optc-mfpmath=sse -optc-msse2 -optc-ffast-math
--

import System
import Foreign.Marshal.Array
import Foreign
import Text.Printf
import Control.Monad
import Data.ByteString.Internal

import GHC.Base
import GHC.Float
import GHC.Int

import Control.Concurrent
import Control.Exception

type Reals = Ptr Double

main = do
    n <- getArgs >>= readIO . head

    u <- mallocArray n :: IO Reals
    forM_ [0..n-1] $ \i -> pokeElemOff u i 1
    v <- mallocArray n :: IO Reals
    forM_ [0..n-1] $ \i -> pokeElemOff v i 0

    powerMethod 10 n u v
    printf "%.9f\n" (eigenvalue n u v 0 0 0)

------------------------------------------------------------------------

eigenvalue :: Int -> Reals -> Reals -> Int -> Double -> Double -> Double
eigenvalue !n !u !v !i !vBv !vv
    | i < n     = eigenvalue n u v (i+1) (vBv + ui * vi) (vv + vi * vi)
    | otherwise = sqrt $! vBv / vv
    where
       ui = inlinePerformIO (peekElemOff u i)
       vi = inlinePerformIO (peekElemOff v i)

------------------------------------------------------------------------

powerMethod :: Int -> Int -> Reals -> Reals -> IO ()
powerMethod i n u v =
    mapM_ takeMVar =<< replicateM i ( do me <- newEmptyMVar;
                                         forkIO (child `finally` putMVar me ());
                                         return me )
   where
    child = allocaArray n $ \t -> timesAtAv t n u v >> timesAtAv t n v u

{-
powerMethod i n u v = do
    -- roll our own synchronisation
    children <- newMVar []
    replicateM_ i $ do
        me <- newEmptyMVar
        cs <- takeMVar children
        putMVar children (me : cs)
        forkIO (child `finally` putMVar me ())
    wait children
  where
    child = allocaArray n $ \t -> timesAtAv t n u v >> timesAtAv t n v u
-}

{-
    -- wait on children
    wait box = do
      cs <- takeMVar box
      case cs of
        []   -> return ()
        m:ms -> putMVar box ms >> takeMVar m >> wait box
-}  

-- multiply vector v by matrix A and then by matrix A transposed
timesAtAv :: Reals -> Int -> Reals -> Reals -> IO ()
timesAtAv !t !n !u !atau = do
    timesAv  n u t
    timesAtv n t atau

timesAv :: Int -> Reals -> Reals -> IO ()
timesAv !n !u !au = go 0
  where
    go :: Int -> IO ()
    go !i = when (i < n) $ do
        pokeElemOff au i (avsum i 0 0)
        go (i+1)

    avsum :: Int -> Int -> Double -> Double
    avsum !i !j !acc
        | j < n = avsum i (j+1) (acc + ((aij i j) * uj))
        | otherwise = acc
        where uj = inlinePerformIO (peekElemOff u j)

timesAtv :: Int -> Reals -> Reals -> IO ()
timesAtv !n !u !a = go 0
  where
    go :: Int -> IO ()
    go !i = when (i < n) $ do
        pokeElemOff a i (atvsum i 0 0)
        go (i+1)

    atvsum :: Int -> Int -> Double -> Double
    atvsum !i !j !acc
        | j < n     = atvsum i (j+1) (acc + ((aij j i) * uj))
        | otherwise = acc
        where uj = inlinePerformIO (peekElemOff u j)

--
-- manually unbox the inner loop:
-- aij i j = 1 / fromIntegral ((i+j) * (i+j+1) `div` 2 + i + 1)
--
aij (I# i) (I# j) = D# (
    case i +# j of
        n -> case n *# (n+#1#) of
                t -> case t `uncheckedIShiftRA#` 1# of
                        u -> case u +# (i +# 1#) of
                                r -> 1.0## /## (int2Double# r))

Easy parallelisation

Takes the existing single core spectral-norm Haskell entry, and parallelises the main loop using explicit concurrency based on forkIO and MVars.

Should give reasonable cpu utilisation, and overall speedup.

Compiling:

   $ ghc C.hs -O2 -optc-O3 -optc-march=pentium4 -optc-mfpmath=sse
   -optc-msse2 -optc-ffast-math -threaded --make

(That is, pretty much the same flags, but with -threaded) Running:

   $ time ./C 5500 +RTS -N5
   1.274224153
   ./C 5500 +RTS -N5  12.57s user 0.01s system 339% cpu 3.708 total

(Just add +RTS -N5 -RTS) Using 5 capabilities (to hide latency). Compared to current sequential entry,

   $ time ./A 5500
   1.274224153
   ./A 5500  12.56s user 0.01s system 99% cpu
   12.586 total

The code:

{-# LANGUAGE BangPatterns, MagicHash #-}
{-# OPTIONS -fvia-C -fexcess-precision #-}
--
-- The Computer Language Benchmarks Game
-- http://shootout.alioth.debian.org/
--
-- Translation from Clean by Don Stewart
-- Modified by Ryan Trinkle: 1) change from divInt# to uncheckedIShiftRA#
--                           2) changed -optc-O to -optc-O3
--                           3) added -optc-ffast-math
-- Parallelised by Don Stewart
--
-- Should be compiled with:
--      -O2 -threaded --make
--      -optc-O2 -optc-march=native -optc-mfpmath=sse -optc-msse2 -optc-ffast-math
--

import System
import Foreign.Marshal.Array
import Foreign
import Text.Printf
import Control.Monad
import Data.ByteString.Internal

import GHC.Base
import GHC.Float
import GHC.Int

import Control.Concurrent
import Control.Exception

type Reals = Ptr Double

main = do
    n <- getArgs >>= readIO . head

    u <- mallocArray n :: IO Reals
    forM_ [0..n-1] $ \i -> pokeElemOff u i 1
    v <- mallocArray n :: IO Reals
    forM_ [0..n-1] $ \i -> pokeElemOff v i 0

    powerMethod 10 n u v
    printf "%.9f\n" (eigenvalue n u v 0 0 0)

------------------------------------------------------------------------

eigenvalue :: Int -> Reals -> Reals -> Int -> Double -> Double -> Double
eigenvalue !n !u !v !i !vBv !vv
    | i < n     = eigenvalue n u v (i+1) (vBv + ui * vi) (vv + vi * vi)
    | otherwise = sqrt $! vBv / vv
    where
       ui = inlinePerformIO (peekElemOff u i)
       vi = inlinePerformIO (peekElemOff v i)

------------------------------------------------------------------------

powerMethod :: Int -> Int -> Reals -> Reals -> IO ()
powerMethod i n u v = do
    -- roll our own synchronisation
    children <- newMVar []
    replicateM_ i $ do
        me <- newEmptyMVar
        cs <- takeMVar children
        putMVar children (me : cs)
        forkIO (child `finally` putMVar me ())
    wait children
  where
    child = allocaArray n $ \t -> timesAtAv t n u v >> timesAtAv t n v u

    -- wait on children
    wait box = do
      cs <- takeMVar box
      case cs of
        []   -> return ()
        m:ms -> putMVar box ms >> takeMVar m >> wait box

-- multiply vector v by matrix A and then by matrix A transposed
timesAtAv :: Reals -> Int -> Reals -> Reals -> IO ()
timesAtAv !t !n !u !atau = do
    timesAv  n u t
    timesAtv n t atau

timesAv :: Int -> Reals -> Reals -> IO ()
timesAv !n !u !au = go 0
  where
    go :: Int -> IO ()
    go !i = when (i < n) $ do
        pokeElemOff au i (avsum i 0 0)
        go (i+1)

    avsum :: Int -> Int -> Double -> Double
    avsum !i !j !acc
        | j < n = avsum i (j+1) (acc + ((aij i j) * uj))
        | otherwise = acc
        where uj = inlinePerformIO (peekElemOff u j)

timesAtv :: Int -> Reals -> Reals -> IO ()
timesAtv !n !u !a = go 0
  where
    go :: Int -> IO ()
    go !i = when (i < n) $ do
        pokeElemOff a i (atvsum i 0 0)
        go (i+1)

    atvsum :: Int -> Int -> Double -> Double
    atvsum !i !j !acc
        | j < n     = atvsum i (j+1) (acc + ((aij j i) * uj))
        | otherwise = acc
        where uj = inlinePerformIO (peekElemOff u j)

--
-- manually unbox the inner loop:
-- aij i j = 1 / fromIntegral ((i+j) * (i+j+1) `div` 2 + i + 1)
--
aij (I# i) (I# j) = D# (
    case i +# j of
        n -> case n *# (n+#1#) of
                t -> case t `uncheckedIShiftRA#` 1# of
                        u -> case u +# (i +# 1#) of
                                r -> 1.0## /## (int2Double# r))

ST/IOUArray-based solution

Based on the STUArray solution from Shootout/Spectral, converted to IOUArray for use of forkIO. (For use with ST it would require a forkAndWaitST implementation...)

Note that the benchmark requires synchronization between each iteration. This solution slices the output array into n parts and spawns one child per slice, waiting for every child to complete before continuing. (In theory, it's inefficient to repeatedly fork and synchronize like this, but it doesn't seem to cost that much.)

{-# OPTIONS_GHC -fglasgow-exts #-}

--
-- The Computer Language Benchmarks Game
-- http://shootout.alioth.debian.org/
--
-- Contributed by Simon Brenner
--
-- Compile with:
--
-- C-options:
--    -fvia-c -optc-O3 -optc-march=native -optc-mfpmath=sse -optc-msse2
--    -optc-ffast-math
-- GHC options:
--    -O2 -fexcess-precision -threaded

import Control.Concurrent
import Control.Concurrent.MVar
import Control.Exception

import Control.Monad.ST
import Control.Monad

import Data.Array.IO
import Data.Array.ST
import Data.Ix
import Data.Bits

import System.Environment
import Text.Printf

-- Various magic used for the unboxed version of the 'a' function
import GHC.Base
import GHC.Float

main = do
    n <- fmap (read . head) getArgs
    --print (spectralNorm n)
    printf "%.9f" =<< spectralNormM forkAndWaitIO allocDoubleIO n

-- STUArray version not parallelised, so we're IO-only for now...
--spectralNorm :: Int -> Double
--spectralNorm n = runST (spectralNormM forkAndWaitST allocDouble n)

--------------------------------------------------------------------------
-- Benchmark code

caps = 4

spectralNormM :: (forall x. (x -> m ()) -> [x] -> m ()) -> Allocator m Int Double -> Int -> m Double
spectralNormM forkAndWait allocator n = case allocator of
  (Allocator alloc) -> do
    let ubound = n-1
    u <- alloc (0,ubound)
    v <- alloc (0,ubound)
    tmp <- alloc (0,ubound)

    fill u 1.0

    replicateM_ 10 $ do
        multiplyAtAv forkAndWait 0 ubound u tmp v
        multiplyAtAv forkAndWait 0 ubound v tmp u
    
    vBv <- u .| v
    vv <- v .| v
    
    return $ sqrt (vBv / vv)

forkChild child = do
    m <- newEmptyMVar
    forkIO $ (child `finally` putMVar m ())
    return m

waitForChildren = mapM_ takeMVar

forkAndWaitIO f xs = waitForChildren =<< mapM (forkChild . f) xs
-- For a serialized version that doesn't fork (but works in any monad)
--forkAndWait f xs = mapM_ f xs

multiplyAtAv forkAndWait lb ub v u atav = do
    let forkChildren f = forkAndWait f (ranges lb caps ub)
    forkChildren (multiplyAv v u)
    forkChildren (multiplyAtv u atav)

multiplyAv v av (l,u) = forM_ [l..u] $ \i -> do
    writeArray av i 0.0
    iterateArray v $ \j -> do
        v' <- readArray v j
        modArray av i (+ ((a i j) * v'))

multiplyAtv v atv (l,u) = forM_ [l..u] $ \i -> do
    writeArray atv i 0.0
    iterateArray v $ \j -> do
        v' <- readArray v j
        modArray atv i (+ ((a j i) * v'))

a :: Int -> Int -> Double
--a i j = 1.0 / fromIntegral (((i + j) * (i + j + 1)) `shiftR` 1 + i + 1)
--a i j = 1.0 / ((i' + j') * ((i' + j' + 1) / 2) + i' + 1)
--  where i' = fromIntegral i; j' = fromIntegral j

a (I# i) (I# j) = D# (
    case i +# j of
        n -> case n *# (n+#1#) of
            t -> case t `uncheckedIShiftRA#` 1# of
                u -> case u +# (i +# 1#) of
                    r -> 1.0## /## (int2Double# r))

--------------------------------------------------------------------------
-- General utility code

fill u x = getBounds u >>= \(lb,ub) -> forM_ [lb..ub] (\i -> writeArray u i x)

u .| v = getBounds u >>= \(lb,ub) -> go lb ub u v 0
    where
        go lb ub u v acc
            | lb <= ub = do
                u' <- readArray u lb
                v' <- readArray v lb
                go (lb+1) ub u v (acc + u'*v')
            | otherwise = return acc

-- Inline pragma: ~103x speedup :)
{-# INLINE modArray #-}
modArray a i f = writeArray a i . f =<< readArray a i

-- Inline pragma: ~18x speedup
{-# INLINE iterateArray #-}
iterateArray arr f = getBounds arr >>= \(lb,ub) -> forM_ [lb..ub] f

ranges lb n ub = go lb ((ub - lb) `div` n) ub
    where
        go x n y | x < y = (x, min (x+n) y) : go (x+n+1) n y
        go _ _ _ = []

-- Ugly hack to allow polymorphically creating arrays in both ST and IO (required by ST's phantom 's' type)
data Allocator m i e = forall a. MArray a e m => 
                       Allocator ((i, i) -> m (a i e))

allocDouble :: Ix i => Allocator (ST s) i Double
allocDouble = Allocator $
    (newArray_ :: Ix i => (i,i) -> ST s (STUArray s i Double))

allocDoubleIO :: Ix i => Allocator IO i Double
allocDoubleIO = Allocator $
    (newArray_ :: Ix i => (i,i) -> IO (IOUArray i Double))