Shootout/Spectral
From HaskellWiki
(→Proposed entry) |
m |
||
| (4 intermediate revisions not shown.) | |||
| Line 1: | Line 1: | ||
| - | |||
A Shootout Entry for the [http://shootout.alioth.debian.org/gp4/benchmark.php?test=spectralnorm&lang=all spectral norm test] | A Shootout Entry for the [http://shootout.alioth.debian.org/gp4/benchmark.php?test=spectralnorm&lang=all spectral norm test] | ||
| Line 9: | Line 8: | ||
||dons || 9.407|| | ||dons || 9.407|| | ||
||old || 10.564|| | ||old || 10.564|| | ||
| + | |||
| + | == Current entry == | ||
| + | |||
| + | Damn fast. | ||
| + | |||
| + | <haskell> | ||
| + | {-# OPTIONS -fvia-C -fexcess-precision #-} | ||
| + | -- | ||
| + | -- The Computer Language Benchmarks Game | ||
| + | -- http://shootout.alioth.debian.org/ | ||
| + | -- | ||
| + | -- Modified by Ryan Trinkle: 1) change from divInt# to uncheckedIShiftRA# | ||
| + | -- 2) changed -optc-O to -optc-O3 | ||
| + | -- 3) added -optc-ffast-math | ||
| + | -- Translation from Clean by Don Stewart | ||
| + | -- | ||
| + | -- Should be compiled with: | ||
| + | -- -O -fglasgow-exts -fbang-patterns | ||
| + | -- -optc-O3 -optc-march=pentium4 -optc-mfpmath=sse -optc-msse2 -optc-ffast-math | ||
| + | -- | ||
| + | -- Note: The following flags appear to INCREASE running time: | ||
| + | -- -O2 -optc-funroll-loops | ||
| + | |||
| + | import System | ||
| + | import Foreign.Marshal.Array | ||
| + | import Foreign | ||
| + | import Text.Printf | ||
| + | import Control.Monad | ||
| + | import Data.ByteString.Internal | ||
| + | |||
| + | import GHC.Base | ||
| + | import GHC.Float | ||
| + | import GHC.Int | ||
| + | |||
| + | type Reals = Ptr Double | ||
| + | |||
| + | main = do | ||
| + | n <- getArgs >>= readIO . head | ||
| + | |||
| + | u <- mallocArray n :: IO Reals | ||
| + | forM_ [0..n-1] $ \i -> pokeElemOff u i 1 | ||
| + | v <- mallocArray n :: IO Reals | ||
| + | forM_ [0..n-1] $ \i -> pokeElemOff v i 0 | ||
| + | |||
| + | powerMethod 10 n u v | ||
| + | printf "%.9f\n" (eigenvalue n u v 0 0 0) | ||
| + | |||
| + | ------------------------------------------------------------------------ | ||
| + | |||
| + | eigenvalue :: Int -> Reals -> Reals -> Int -> Double -> Double -> Double | ||
| + | eigenvalue !n !u !v !i !vBv !vv | ||
| + | | i < n = eigenvalue n u v (i+1) (vBv + ui * vi) (vv + vi * vi) | ||
| + | | otherwise = sqrt $! vBv / vv | ||
| + | where | ||
| + | ui = inlinePerformIO (peekElemOff u i) | ||
| + | vi = inlinePerformIO (peekElemOff v i) | ||
| + | |||
| + | ------------------------------------------------------------------------ | ||
| + | |||
| + | powerMethod :: Int -> Int -> Reals -> Reals -> IO () | ||
| + | powerMethod !i !n !u !v = allocaArray n $ \t -> | ||
| + | replicateM_ i $ timesAtAv t n u v >> timesAtAv t n v u | ||
| + | |||
| + | -- multiply vector v by matrix A and then by matrix A transposed | ||
| + | timesAtAv :: Reals -> Int -> Reals -> Reals -> IO () | ||
| + | timesAtAv !t !n !u !atau = do | ||
| + | timesAv n u t | ||
| + | timesAtv n t atau | ||
| + | |||
| + | timesAv :: Int -> Reals -> Reals -> IO () | ||
| + | timesAv !n !u !au = go 0 | ||
| + | where | ||
| + | go :: Int -> IO () | ||
| + | go !i = when (i < n) $ do | ||
| + | pokeElemOff au i (avsum i 0 0) | ||
| + | go (i+1) | ||
| + | |||
| + | avsum :: Int -> Int -> Double -> Double | ||
| + | avsum !i !j !acc | ||
| + | | j < n = avsum i (j+1) (acc + ((aij i j) * uj)) | ||
| + | | otherwise = acc | ||
| + | where uj = inlinePerformIO (peekElemOff u j) | ||
| + | |||
| + | timesAtv :: Int -> Reals -> Reals -> IO () | ||
| + | timesAtv !n !u !a = go 0 | ||
| + | where | ||
| + | go :: Int -> IO () | ||
| + | go !i = when (i < n) $ do | ||
| + | pokeElemOff a i (atvsum i 0 0) | ||
| + | go (i+1) | ||
| + | |||
| + | atvsum :: Int -> Int -> Double -> Double | ||
| + | atvsum !i !j !acc | ||
| + | | j < n = atvsum i (j+1) (acc + ((aij j i) * uj)) | ||
| + | | otherwise = acc | ||
| + | where uj = inlinePerformIO (peekElemOff u j) | ||
| + | |||
| + | -- | ||
| + | -- manually unbox the inner loop: | ||
| + | -- aij i j = 1 / fromIntegral ((i+j) * (i+j+1) `div` 2 + i + 1) | ||
| + | -- | ||
| + | aij (I# i) (I# j) = D# ( | ||
| + | case i +# j of | ||
| + | n -> case n *# (n+#1#) of | ||
| + | t -> case t `uncheckedIShiftRA#` 1# of | ||
| + | u -> case u +# (i +# 1#) of | ||
| + | r -> 1.0## /## (int2Double# r)) | ||
| + | </haskell> | ||
| + | |||
| + | == Proposed entry == | ||
| + | |||
| + | Uses Ptr Double, and carefully unboxes the aij inner computation (where 70% of the time is spent). Runs 2.3x slower than C on my p4. | ||
| + | |||
| + | [https://alioth.debian.org/tracker/index.php?func=detail&aid=304453&group_id=30402&atid=411646 Submitted] | ||
| + | |||
| + | <haskell> | ||
| + | {-# OPTIONS -fexcess-precision #-} | ||
| + | -- | ||
| + | -- The Computer Language Shootout | ||
| + | -- http://shootout.alioth.debian.org/ | ||
| + | -- | ||
| + | -- Translation from Clean by Don Stewart | ||
| + | -- | ||
| + | -- Should be compiled with: | ||
| + | -- -O -fglasgow-exts -fbang-patterns | ||
| + | -- -optc-O -optc-march=pentium4 -optc-mfpmath=sse -optc-msse2 | ||
| + | -- | ||
| + | |||
| + | import System | ||
| + | import Foreign.Marshal.Array | ||
| + | import Foreign | ||
| + | import Text.Printf | ||
| + | import Control.Monad | ||
| + | import Data.ByteString.Base | ||
| + | |||
| + | import GHC.Base | ||
| + | import GHC.Float | ||
| + | import GHC.Int | ||
| + | |||
| + | type Reals = Ptr Double | ||
| + | |||
| + | main = do | ||
| + | n <- getArgs >>= readIO . head | ||
| + | |||
| + | u <- mallocArray n :: IO Reals | ||
| + | forM_ [0..n-1] $ \i -> pokeElemOff u i 1 | ||
| + | v <- mallocArray n :: IO Reals | ||
| + | forM_ [0..n-1] $ \i -> pokeElemOff v i 0 | ||
| + | |||
| + | powerMethod 10 n u v | ||
| + | printf "%.9f\n" (eigenvalue n u v 0 0 0) | ||
| + | |||
| + | ------------------------------------------------------------------------ | ||
| + | |||
| + | eigenvalue :: Int -> Reals -> Reals -> Int -> Double -> Double -> Double | ||
| + | eigenvalue !n !u !v !i !vBv !vv | ||
| + | | i < n = eigenvalue n u v (i+1) (vBv + ui * vi) (vv + vi * vi) | ||
| + | | otherwise = sqrt $! vBv / vv | ||
| + | where | ||
| + | ui = inlinePerformIO (peekElemOff u i) | ||
| + | vi = inlinePerformIO (peekElemOff v i) | ||
| + | |||
| + | ------------------------------------------------------------------------ | ||
| + | |||
| + | powerMethod :: Int -> Int -> Reals -> Reals -> IO () | ||
| + | powerMethod !i !n !u !v = allocaArray n $ \t -> | ||
| + | replicateM_ i $ timesAtAv t n u v >> timesAtAv t n v u | ||
| + | |||
| + | -- multiply vector v by matrix A and then by matrix A transposed | ||
| + | timesAtAv :: Reals -> Int -> Reals -> Reals -> IO () | ||
| + | timesAtAv !t !n !u !atau = do | ||
| + | timesAv n u t | ||
| + | timesAtv n t atau | ||
| + | {-# INLINE timesAtAv #-} | ||
| + | |||
| + | timesAv :: Int -> Reals -> Reals -> IO () | ||
| + | timesAv !n !u !au = go 0 | ||
| + | where | ||
| + | go :: Int -> IO () | ||
| + | go !i = when (i < n) $ do | ||
| + | pokeElemOff au i (avsum i 0 0) | ||
| + | go (i+1) | ||
| + | |||
| + | avsum :: Int -> Int -> Double -> Double | ||
| + | avsum !i !j !acc | ||
| + | | j < n = avsum i (j+1) (acc + ((aij i j) * uj)) | ||
| + | | otherwise = acc | ||
| + | where uj = inlinePerformIO (peekElemOff u j) | ||
| + | {-# INLINE timesAv #-} | ||
| + | |||
| + | timesAtv :: Int -> Reals -> Reals -> IO () | ||
| + | timesAtv !n !u !a = go 0 | ||
| + | where | ||
| + | go :: Int -> IO () | ||
| + | go !i = when (i < n) $ do | ||
| + | pokeElemOff a i (atvsum i 0 0) | ||
| + | go (i+1) | ||
| + | |||
| + | atvsum :: Int -> Int -> Double -> Double | ||
| + | atvsum !i !j !acc | ||
| + | | j < n = atvsum i (j+1) (acc + ((aij j i) * uj)) | ||
| + | | otherwise = acc | ||
| + | where uj = inlinePerformIO (peekElemOff u j) | ||
| + | {-# INLINE timesAtv #-} | ||
| + | |||
| + | -- | ||
| + | -- manually unbox the inner loop: | ||
| + | -- aij i j = 1 / fromIntegral ((i+j) * (i+j+1) `div` 2 + i + 1) | ||
| + | -- | ||
| + | aij (I# i) (I# j) = D# ( | ||
| + | case i +# j of | ||
| + | n -> case n *# (n+#1#) of | ||
| + | t -> case divInt# t 2# of | ||
| + | u -> case u +# (i +# 1#) of | ||
| + | r -> 1.0## /## (int2Double# r)) | ||
| + | </haskell> | ||
== Current entry == | == Current entry == | ||
| Line 14: | Line 229: | ||
Better gcc flags. Bang patterns. | Better gcc flags. Bang patterns. | ||
[https://alioth.debian.org/tracker/index.php?func=detail&aid=304453&group_id=30402&atid=411646 Submitted] | [https://alioth.debian.org/tracker/index.php?func=detail&aid=304453&group_id=30402&atid=411646 Submitted] | ||
| + | |||
| + | Didn't do so well in practice though :/ | ||
<haskell> | <haskell> | ||
| Line 24: | Line 241: | ||
-- | -- | ||
| - | import Monad | + | import Control.Monad |
import System | import System | ||
import Text.Printf | import Text.Printf | ||
| Line 35: | Line 252: | ||
u <- newArray (0,n-1) 1 :: IO (IOUArray Int Double) | u <- newArray (0,n-1) 1 :: IO (IOUArray Int Double) | ||
v <- newArray (0,n-1) 0 :: IO (IOUArray Int Double) | v <- newArray (0,n-1) 0 :: IO (IOUArray Int Double) | ||
| - | + | replicateM_ 10 $ multiplyAtAv n u v >> multiplyAtAv n v u | |
let loop !vbv !vv !i | let loop !vbv !vv !i | ||
| Line 87: | Line 304: | ||
-- | -- | ||
| - | import Monad | + | import Control.Monad |
import System | import System | ||
import Numeric | import Numeric | ||
| Line 98: | Line 315: | ||
u <- newArray (0,n-1) 1 :: IO (IOUArray Int Double) | u <- newArray (0,n-1) 1 :: IO (IOUArray Int Double) | ||
v <- newArray_ (0,n-1) :: IO (IOUArray Int Double) | v <- newArray_ (0,n-1) :: IO (IOUArray Int Double) | ||
| - | + | replicateM_ 10 $ multiplyAtAv n u v >> multiplyAtAv n v u | |
loop 0 0 0 n u v | loop 0 0 0 n u v | ||
| Line 145: | Line 362: | ||
-- Conversion to Haskell by Einar Karttunen | -- Conversion to Haskell by Einar Karttunen | ||
| + | import Control.Monad | ||
import Control.Monad.ST | import Control.Monad.ST | ||
import Data.Array.Base | import Data.Array.Base | ||
| Line 179: | Line 397: | ||
let (vBv,vv) = runST (do u <- newArray (0,n-1) 1 | let (vBv,vv) = runST (do u <- newArray (0,n-1) 1 | ||
v <- newArray (0,n-1) 0 | v <- newArray (0,n-1) 0 | ||
| - | + | replicateM_ 10 (eval_AtA_Times_u u v >> eval_AtA_Times_u v u) | |
vLoop u v n (0, 0)) | vLoop u v n (0, 0)) | ||
putStrLn $ showFFloat (Just 9) (sqrt (vBv/vv)) "" | putStrLn $ showFFloat (Just 9) (sqrt (vBv/vv)) "" | ||
| Line 189: | Line 407: | ||
vi <- unsafeRead v i | vi <- unsafeRead v i | ||
return (vBv+(ui*vi),vv+(vi*vi)) | return (vBv+(ui*vi),vv+(vi*vi)) | ||
| + | </haskell> | ||
| + | |||
| + | == STUArray-based solution == | ||
| + | |||
| + | One-line-change convertible to IOUArray, which seems to be exactly as fast as STUArray. The important part seems to be the unboxed version of the 'a' function and the iterateM_ hack (which I don't quite know how it works at all). | ||
| + | |||
| + | Otherwise, it seems to be just about exactly as fast as the unsafe Ptr Double versions. | ||
| + | |||
| + | <haskell> | ||
| + | {-# OPTIONS_GHC -fglasgow-exts #-} | ||
| + | |||
| + | -- | ||
| + | -- The Computer Language Benchmarks Game | ||
| + | -- http://shootout.alioth.debian.org/ | ||
| + | -- | ||
| + | -- Contributed by Simon Brenner | ||
| + | -- | ||
| + | -- Compile with: | ||
| + | -- | ||
| + | -- C-options: | ||
| + | -- -fvia-c -optc-O3 -optc-march=native -optc-mfpmath=sse -optc-msse2 | ||
| + | -- -optc-ffast-math | ||
| + | -- GHC options: | ||
| + | -- -O2 -fexcess-precision | ||
| + | |||
| + | import Control.Monad.ST | ||
| + | import Control.Monad | ||
| + | |||
| + | import Data.Array.IO | ||
| + | import Data.Array.ST | ||
| + | import Data.Ix | ||
| + | import Data.Bits | ||
| + | |||
| + | import System.Environment | ||
| + | |||
| + | -- Various magic used for the unboxed version of the 'a' function | ||
| + | import GHC.Base | ||
| + | import GHC.Float | ||
| + | |||
| + | main = do | ||
| + | n <- fmap (read . head) getArgs | ||
| + | print (spectralNorm n) | ||
| + | -- Use this variant for IOUArray rather than STUArray | ||
| + | --print =<< spectralNormM allocDoubleIO n | ||
| + | |||
| + | spectralNorm :: Int -> Double | ||
| + | spectralNorm n = runST (spectralNormM allocDouble n) | ||
| + | |||
| + | ----------------- | ||
| + | -- Benchmark code | ||
| + | |||
| + | spectralNormM allocator n = case allocator of | ||
| + | (Allocator alloc) -> do | ||
| + | let ubound = n-1 | ||
| + | u <- alloc (0,ubound) | ||
| + | v <- alloc (0,ubound) | ||
| + | tmp <- alloc (0,ubound) | ||
| + | |||
| + | fill u 1.0 | ||
| + | |||
| + | replicateM_ 10 $ do | ||
| + | multiplyAtAv ubound u tmp v | ||
| + | multiplyAtAv ubound v tmp u | ||
| + | |||
| + | vBv <- u .| v | ||
| + | vv <- v .| v | ||
| + | |||
| + | return $ sqrt (vBv / vv) | ||
| + | |||
| + | multiplyAtAv n v u atav = do | ||
| + | multiplyAv n v u | ||
| + | multiplyAtv n u atav | ||
| + | |||
| + | multiplyAv n v av = iterateM_ 0 n $ \i -> do | ||
| + | writeArray av i 0.0 | ||
| + | iterateM_ 0 n $ \j -> do | ||
| + | v' <- readArray v j | ||
| + | modArray av i (+ ((a i j) * v')) | ||
| + | |||
| + | multiplyAtv n v atv = iterateM_ 0 n $ \i -> do | ||
| + | writeArray atv i 0.0 | ||
| + | iterateM_ 0 n $ \j -> do | ||
| + | v' <- readArray v j | ||
| + | modArray atv i (+ ((a j i) * v')) | ||
| + | |||
| + | a :: Int -> Int -> Double | ||
| + | --a i j = 1.0 / fromIntegral (((i + j) * (i + j + 1)) `shiftR` 1 + i + 1) | ||
| + | --a i j = 1.0 / (((i' + j') * (i' + j' + 1) / 2) + i' + 1) | ||
| + | -- where i' = fromIntegral i; j' = fromIntegral j | ||
| + | a (I# i) (I# j) = D# ( | ||
| + | case i +# j of | ||
| + | n -> case n *# (n+#1#) of | ||
| + | t -> case t `uncheckedIShiftRA#` 1# of | ||
| + | u -> case u +# (i +# 1#) of | ||
| + | r -> 1.0## /## (int2Double# r)) | ||
| + | |||
| + | ----------------------- | ||
| + | -- General utility code | ||
| + | |||
| + | fill u x = getBounds u >>= \(lb,ub) -> forM_ [lb..ub] (\i -> writeArray u i x) | ||
| + | |||
| + | u .| v = getBounds u >>= \(lb,ub) -> go lb ub u v 0 | ||
| + | where | ||
| + | go lb ub u v acc | ||
| + | | lb <= ub = do | ||
| + | u' <- readArray u lb | ||
| + | v' <- readArray v lb | ||
| + | go (lb+1) ub u v (acc + u'*v') | ||
| + | | otherwise = return acc | ||
| + | |||
| + | -- Explicitly inlining this seems to stand for about a 24x speedup :) | ||
| + | {-# INLINE modArray #-} | ||
| + | modArray a i f = writeArray a i . f =<< readArray a i | ||
| + | |||
| + | -- This is really weird! Replacing iterateM_ x y with forM_ [x..y] in the loops | ||
| + | -- above makes the program go twice as slowly. (Really!) | ||
| + | |||
| + | {-# INLINE iterateM_ #-} | ||
| + | {-# RULES "inline-iterateM_" iterateM_ = \lb ub f -> forM_ [lb..ub] f #-} | ||
| + | iterateM_ lb ub = forM_ [lb..ub] | ||
| + | |||
| + | data Allocator m i e = forall a. MArray a e m => | ||
| + | Allocator ((i, i) -> m (a i e)) | ||
| + | |||
| + | allocDouble :: Ix i => Allocator (ST s) i Double | ||
| + | allocDouble = Allocator $ | ||
| + | (newArray_ :: Ix i => (i,i) -> ST s (STUArray s i Double)) | ||
| + | |||
| + | allocDoubleIO :: Ix i => Allocator IO i Double | ||
| + | allocDoubleIO = Allocator $ | ||
| + | (newArray_ :: Ix i => (i,i) -> IO (IOUArray i Double)) | ||
</haskell> | </haskell> | ||
[[Category:Code]] | [[Category:Code]] | ||
Current revision
A Shootout Entry for the spectral norm test
Contents |
1 Timings
Debian Linux x86, n=2,500
||Entry || Time|| ||dons || 9.407|| ||old || 10.564||
2 Current entry
Damn fast.
{-# OPTIONS -fvia-C -fexcess-precision #-} -- -- The Computer Language Benchmarks Game -- http://shootout.alioth.debian.org/ -- -- Modified by Ryan Trinkle: 1) change from divInt# to uncheckedIShiftRA# -- 2) changed -optc-O to -optc-O3 -- 3) added -optc-ffast-math -- Translation from Clean by Don Stewart -- -- Should be compiled with: -- -O -fglasgow-exts -fbang-patterns -- -optc-O3 -optc-march=pentium4 -optc-mfpmath=sse -optc-msse2 -optc-ffast-math -- -- Note: The following flags appear to INCREASE running time: -- -O2 -optc-funroll-loops import System import Foreign.Marshal.Array import Foreign import Text.Printf import Control.Monad import Data.ByteString.Internal import GHC.Base import GHC.Float import GHC.Int type Reals = Ptr Double main = do n <- getArgs >>= readIO . head u <- mallocArray n :: IO Reals forM_ [0..n-1] $ \i -> pokeElemOff u i 1 v <- mallocArray n :: IO Reals forM_ [0..n-1] $ \i -> pokeElemOff v i 0 powerMethod 10 n u v printf "%.9f\n" (eigenvalue n u v 0 0 0) ------------------------------------------------------------------------ eigenvalue :: Int -> Reals -> Reals -> Int -> Double -> Double -> Double eigenvalue !n !u !v !i !vBv !vv | i < n = eigenvalue n u v (i+1) (vBv + ui * vi) (vv + vi * vi) | otherwise = sqrt $! vBv / vv where ui = inlinePerformIO (peekElemOff u i) vi = inlinePerformIO (peekElemOff v i) ------------------------------------------------------------------------ powerMethod :: Int -> Int -> Reals -> Reals -> IO () powerMethod !i !n !u !v = allocaArray n $ \t -> replicateM_ i $ timesAtAv t n u v >> timesAtAv t n v u -- multiply vector v by matrix A and then by matrix A transposed timesAtAv :: Reals -> Int -> Reals -> Reals -> IO () timesAtAv !t !n !u !atau = do timesAv n u t timesAtv n t atau timesAv :: Int -> Reals -> Reals -> IO () timesAv !n !u !au = go 0 where go :: Int -> IO () go !i = when (i < n) $ do pokeElemOff au i (avsum i 0 0) go (i+1) avsum :: Int -> Int -> Double -> Double avsum !i !j !acc | j < n = avsum i (j+1) (acc + ((aij i j) * uj)) | otherwise = acc where uj = inlinePerformIO (peekElemOff u j) timesAtv :: Int -> Reals -> Reals -> IO () timesAtv !n !u !a = go 0 where go :: Int -> IO () go !i = when (i < n) $ do pokeElemOff a i (atvsum i 0 0) go (i+1) atvsum :: Int -> Int -> Double -> Double atvsum !i !j !acc | j < n = atvsum i (j+1) (acc + ((aij j i) * uj)) | otherwise = acc where uj = inlinePerformIO (peekElemOff u j) -- -- manually unbox the inner loop: -- aij i j = 1 / fromIntegral ((i+j) * (i+j+1) `div` 2 + i + 1) -- aij (I# i) (I# j) = D# ( case i +# j of n -> case n *# (n+#1#) of t -> case t `uncheckedIShiftRA#` 1# of u -> case u +# (i +# 1#) of r -> 1.0## /## (int2Double# r))
3 Proposed entry
Uses Ptr Double, and carefully unboxes the aij inner computation (where 70% of the time is spent). Runs 2.3x slower than C on my p4.
{-# OPTIONS -fexcess-precision #-} -- -- The Computer Language Shootout -- http://shootout.alioth.debian.org/ -- -- Translation from Clean by Don Stewart -- -- Should be compiled with: -- -O -fglasgow-exts -fbang-patterns -- -optc-O -optc-march=pentium4 -optc-mfpmath=sse -optc-msse2 -- import System import Foreign.Marshal.Array import Foreign import Text.Printf import Control.Monad import Data.ByteString.Base import GHC.Base import GHC.Float import GHC.Int type Reals = Ptr Double main = do n <- getArgs >>= readIO . head u <- mallocArray n :: IO Reals forM_ [0..n-1] $ \i -> pokeElemOff u i 1 v <- mallocArray n :: IO Reals forM_ [0..n-1] $ \i -> pokeElemOff v i 0 powerMethod 10 n u v printf "%.9f\n" (eigenvalue n u v 0 0 0) ------------------------------------------------------------------------ eigenvalue :: Int -> Reals -> Reals -> Int -> Double -> Double -> Double eigenvalue !n !u !v !i !vBv !vv | i < n = eigenvalue n u v (i+1) (vBv + ui * vi) (vv + vi * vi) | otherwise = sqrt $! vBv / vv where ui = inlinePerformIO (peekElemOff u i) vi = inlinePerformIO (peekElemOff v i) ------------------------------------------------------------------------ powerMethod :: Int -> Int -> Reals -> Reals -> IO () powerMethod !i !n !u !v = allocaArray n $ \t -> replicateM_ i $ timesAtAv t n u v >> timesAtAv t n v u -- multiply vector v by matrix A and then by matrix A transposed timesAtAv :: Reals -> Int -> Reals -> Reals -> IO () timesAtAv !t !n !u !atau = do timesAv n u t timesAtv n t atau {-# INLINE timesAtAv #-} timesAv :: Int -> Reals -> Reals -> IO () timesAv !n !u !au = go 0 where go :: Int -> IO () go !i = when (i < n) $ do pokeElemOff au i (avsum i 0 0) go (i+1) avsum :: Int -> Int -> Double -> Double avsum !i !j !acc | j < n = avsum i (j+1) (acc + ((aij i j) * uj)) | otherwise = acc where uj = inlinePerformIO (peekElemOff u j) {-# INLINE timesAv #-} timesAtv :: Int -> Reals -> Reals -> IO () timesAtv !n !u !a = go 0 where go :: Int -> IO () go !i = when (i < n) $ do pokeElemOff a i (atvsum i 0 0) go (i+1) atvsum :: Int -> Int -> Double -> Double atvsum !i !j !acc | j < n = atvsum i (j+1) (acc + ((aij j i) * uj)) | otherwise = acc where uj = inlinePerformIO (peekElemOff u j) {-# INLINE timesAtv #-} -- -- manually unbox the inner loop: -- aij i j = 1 / fromIntegral ((i+j) * (i+j+1) `div` 2 + i + 1) -- aij (I# i) (I# j) = D# ( case i +# j of n -> case n *# (n+#1#) of t -> case divInt# t 2# of u -> case u +# (i +# 1#) of r -> 1.0## /## (int2Double# r))
4 Current entry
Better gcc flags. Bang patterns. Submitted
Didn't do so well in practice though :/
{-# OPTIONS -O -fglasgow-exts -fbang-patterns -funbox-strict-fields -fexcess-precision -optc-O2 -optc-march=pentium4 -optc-mfpmath=sse -optc-msse2 #-} -- -- The Great Computer Language Shootout -- http:--shootout.alioth.debian.org/ -- -- Contributed by Don Stewart -- import Control.Monad import System import Text.Printf import Data.Array.IO import Data.Array.Base main = getArgs >>= approximate . read . head >>= printf "%.9f\n" approximate n = do u <- newArray (0,n-1) 1 :: IO (IOUArray Int Double) v <- newArray (0,n-1) 0 :: IO (IOUArray Int Double) replicateM_ 10 $ multiplyAtAv n u v >> multiplyAtAv n v u let loop !vbv !vv !i | i >= n = return (vbv,vv) | otherwise = do ui <- unsafeRead u i vi <- unsafeRead v i loop (vbv + ui * vi) (vv + vi * vi) (i+1) (vbv,vv) <- loop 0 0 0 return $! sqrt (vbv/vv) -- return element i,j of infinite matrix A a i j = 1 / fromIntegral (x*(x+1) `div` 2 + i + 1) where x = i+j -- multiply vector v by matrix A */ multiplyAv !n !v !av = loop 0 where loop i = when (i < n) $ do avi <- loop' i 0 0 unsafeWrite av i avi >> loop (i+1) loop' !i !j !av | j >= n = return av | otherwise = do vj <- unsafeRead v j loop' i (j+1) (av + a i j * vj) -- multiply vector v by matrix A transposed multiplyAtv !n !v !atv = loop 0 where loop i = when (i < n) $ do atvi <- loop' i 0 0 unsafeWrite atv i atvi >> loop (i+1) loop' !i !j !atvi | j >= n = return atvi | otherwise = do vj <- unsafeRead v j loop' i (j+1) (atvi + a j i * vj) -- multiply vector v by matrix A and then by matrix A transposed */ multiplyAtAv !n !v !atav = do u <- newArray (0,n-1) 0 :: IO (IOUArray Int Double) multiplyAv n v u >> multiplyAtv n u atav
5 Old entry
GHC unboxed this better. Careful attention payed to the unboxing. -O2 -optc-O -optc-ffast-math -fexcess-precision
{-# OPTIONS -optc-O #-} -- -- The Great Computer Language Shootout -- http:--shootout.alioth.debian.org/ -- -- Contributed by Don Stewart -- -- gcc miscompiles this program with -O3 -- import Control.Monad import System import Numeric import Data.Array.IO import Data.Array.Base main = getArgs >>= approximate . read . head >>= putStrLn . \s -> showFFloat (Just 9) s [] approximate n = do u <- newArray (0,n-1) 1 :: IO (IOUArray Int Double) v <- newArray_ (0,n-1) :: IO (IOUArray Int Double) replicateM_ 10 $ multiplyAtAv n u v >> multiplyAtAv n v u loop 0 0 0 n u v loop vbv vv i n u v | vbv `seq` vv `seq` i `seq` n `seq` u `seq` v `seq` False = undefined loop vbv vv i n u v | i >= n = return $! sqrt (vbv/vv) | otherwise = do ui <- unsafeRead u i vi <- unsafeRead v i loop (vbv + ui * vi) (vv + vi * vi) (i+1) n u v -- return element i,j of infinite matrix A a i j | i `seq` j `seq` False = undefined a i j = 1 / fromIntegral (x*(x+1) `div` 2 + i + 1) where x = i+j -- multiply vector v by matrix A */ multiplyAv n v av | n `seq` v `seq` av `seq` False = undefined multiplyAv n v av = loop 0 where loop i = when (i < n) $ loop' i 0 0 >>= unsafeWrite av i >> loop (i+1) loop' i j av | i `seq` j `seq` av `seq` False = undefined loop' i j av | j >= n = return av | otherwise = do vj <- v `unsafeRead` j loop' i (j+1) (av + a i j * vj) -- multiply vector v by matrix A transposed multiplyAtv n v atv | n `seq` v `seq` atv `seq` False = undefined multiplyAtv n v atv = loop 0 where loop i = when (i < n) $ loop' i 0 0 >>= unsafeWrite atv i >> loop (i+1) loop' i j atvi | j `seq` atvi `seq` False = undefined loop' i j atvi | j >= n = return atvi | otherwise = do vj <- v `unsafeRead` j loop' i (j+1) (atvi + a j i * vj) -- multiply vector v by matrix A and then by matrix A transposed */ multiplyAtAv n v atav | n `seq` v `seq` atav `seq` False = undefined multiplyAtAv n v atav = do u <- newArray_ (0,n-1) :: IO (IOUArray Int Double) multiplyAv n v u >> multiplyAtv n u atav
6 Current entry
-- The Great Computer Language Shootout -- http:--shootout.alioth.debian.org/ -- -- Original C contributed by Sebastien Loisel -- Conversion to C++ by Jon Harrop -- Conversion to Haskell by Einar Karttunen import Control.Monad import Control.Monad.ST import Data.Array.Base import Data.Array.ST import Numeric import System eval_A :: Int -> Int -> Double eval_A i j = 1 / fromIntegral ((i+j)*(i+j+1) `div` 2 + i + 1) plusAt :: STUArray s Int Double -> Int -> Double -> ST s () plusAt a i v = do o <- unsafeRead a i unsafeWrite a i (v+o) eval_A_Times_u :: STUArray s Int Double -> STUArray s Int Double -> ST s () eval_A_Times_u u au = outer (snd $ bounds u) where outer 0 = unsafeWrite au 0 0 >> inner 0 (snd $ bounds u) outer i = unsafeWrite au i 0 >> inner i (snd $ bounds u) >> outer (i-1) inner i 0 = unsafeRead u 0 >>= \uj -> plusAt au i (eval_A i 0 * uj) inner i j = unsafeRead u j >>= \uj -> plusAt au i (eval_A i j * uj) >> inner i (j-1) eval_At_Times_u :: STUArray s Int Double -> STUArray s Int Double -> ST s () eval_At_Times_u u au = outer (snd $ bounds u) where outer 0 = unsafeWrite au 0 0 >> inner 0 (snd $ bounds u) outer i = unsafeWrite au i 0 >> inner i (snd $ bounds u) >> outer (i-1) inner i 0 = unsafeRead u 0 >>= \uj -> plusAt au i (eval_A 0 i * uj) inner i j = unsafeRead u j >>= \uj -> plusAt au i (eval_A j i * uj) >> inner i (j-1) eval_AtA_Times_u u v = do w <- newArray (bounds u) 0 eval_A_Times_u u w >> eval_At_Times_u w v main = do n <- getArgs >>= return.read.head let (vBv,vv) = runST (do u <- newArray (0,n-1) 1 v <- newArray (0,n-1) 0 replicateM_ 10 (eval_AtA_Times_u u v >> eval_AtA_Times_u v u) vLoop u v n (0, 0)) putStrLn $ showFFloat (Just 9) (sqrt (vBv/vv)) "" vLoop :: STUArray s Int Double -> STUArray s Int Double -> Int -> (Double,Double) -> ST s (Double,Double) vLoop u v 0 a = return a vLoop u v (i+1) (vBv,vv) = vLoop u v i =<< op where op = do ui <- unsafeRead u i vi <- unsafeRead v i return (vBv+(ui*vi),vv+(vi*vi))
7 STUArray-based solution
One-line-change convertible to IOUArray, which seems to be exactly as fast as STUArray. The important part seems to be the unboxed version of the 'a' function and the iterateM_ hack (which I don't quite know how it works at all).
Otherwise, it seems to be just about exactly as fast as the unsafe Ptr Double versions.
{-# OPTIONS_GHC -fglasgow-exts #-} -- -- The Computer Language Benchmarks Game -- http://shootout.alioth.debian.org/ -- -- Contributed by Simon Brenner -- -- Compile with: -- -- C-options: -- -fvia-c -optc-O3 -optc-march=native -optc-mfpmath=sse -optc-msse2 -- -optc-ffast-math -- GHC options: -- -O2 -fexcess-precision import Control.Monad.ST import Control.Monad import Data.Array.IO import Data.Array.ST import Data.Ix import Data.Bits import System.Environment -- Various magic used for the unboxed version of the 'a' function import GHC.Base import GHC.Float main = do n <- fmap (read . head) getArgs print (spectralNorm n) -- Use this variant for IOUArray rather than STUArray --print =<< spectralNormM allocDoubleIO n spectralNorm :: Int -> Double spectralNorm n = runST (spectralNormM allocDouble n) ----------------- -- Benchmark code spectralNormM allocator n = case allocator of (Allocator alloc) -> do let ubound = n-1 u <- alloc (0,ubound) v <- alloc (0,ubound) tmp <- alloc (0,ubound) fill u 1.0 replicateM_ 10 $ do multiplyAtAv ubound u tmp v multiplyAtAv ubound v tmp u vBv <- u .| v vv <- v .| v return $ sqrt (vBv / vv) multiplyAtAv n v u atav = do multiplyAv n v u multiplyAtv n u atav multiplyAv n v av = iterateM_ 0 n $ \i -> do writeArray av i 0.0 iterateM_ 0 n $ \j -> do v' <- readArray v j modArray av i (+ ((a i j) * v')) multiplyAtv n v atv = iterateM_ 0 n $ \i -> do writeArray atv i 0.0 iterateM_ 0 n $ \j -> do v' <- readArray v j modArray atv i (+ ((a j i) * v')) a :: Int -> Int -> Double --a i j = 1.0 / fromIntegral (((i + j) * (i + j + 1)) `shiftR` 1 + i + 1) --a i j = 1.0 / (((i' + j') * (i' + j' + 1) / 2) + i' + 1) -- where i' = fromIntegral i; j' = fromIntegral j a (I# i) (I# j) = D# ( case i +# j of n -> case n *# (n+#1#) of t -> case t `uncheckedIShiftRA#` 1# of u -> case u +# (i +# 1#) of r -> 1.0## /## (int2Double# r)) ----------------------- -- General utility code fill u x = getBounds u >>= \(lb,ub) -> forM_ [lb..ub] (\i -> writeArray u i x) u .| v = getBounds u >>= \(lb,ub) -> go lb ub u v 0 where go lb ub u v acc | lb <= ub = do u' <- readArray u lb v' <- readArray v lb go (lb+1) ub u v (acc + u'*v') | otherwise = return acc -- Explicitly inlining this seems to stand for about a 24x speedup :) {-# INLINE modArray #-} modArray a i f = writeArray a i . f =<< readArray a i -- This is really weird! Replacing iterateM_ x y with forM_ [x..y] in the loops -- above makes the program go twice as slowly. (Really!) {-# INLINE iterateM_ #-} {-# RULES "inline-iterateM_" iterateM_ = \lb ub f -> forM_ [lb..ub] f #-} iterateM_ lb ub = forM_ [lb..ub] data Allocator m i e = forall a. MArray a e m => Allocator ((i, i) -> m (a i e)) allocDouble :: Ix i => Allocator (ST s) i Double allocDouble = Allocator $ (newArray_ :: Ix i => (i,i) -> ST s (STUArray s i Double)) allocDoubleIO :: Ix i => Allocator IO i Double allocDoubleIO = Allocator $ (newArray_ :: Ix i => (i,i) -> IO (IOUArray i Double))
