Personal tools

Shootout/Knucleotide

From HaskellWiki

< Shootout(Difference between revisions)
Jump to: navigation, search
(General editing to get nearer to guidelines and improve tables.)
m
 
(6 intermediate revisions by 4 users not shown)
Line 10: Line 10:
 
* read line-by-line a redirected FASTA format file from stdin
 
* read line-by-line a redirected FASTA format file from stdin
 
* extract DNA sequence THREE
 
* extract DNA sequence THREE
* define a procedure/function to update a hashtable of k-nucleotide keys and count values, for a particular reading-frame even though we'll combine k-nucleotide counts for all reading-frames (grow the hashtable from a small default size)
+
* define a procedure/function to update a hashtable of k-nucleotide keys and count values, for a particular reading-frame - even though we'll combine k-nucleotide counts for all reading-frames (grow the hashtable from a small default size)
 
* write the code and percentage frequency, for all the 1-nucleotide and 2-nucleotide sequences, sorted by descending frequency and then ascending k-nucleotide key
 
* write the code and percentage frequency, for all the 1-nucleotide and 2-nucleotide sequences, sorted by descending frequency and then ascending k-nucleotide key
 
* count all the 3- 4- 6- 12- and 18-nucleotide sequences, and write the count and code for specific sequences
 
* count all the 3- 4- 6- 12- and 18-nucleotide sequences, and write the count and code for specific sequences
Line 64: Line 64:
 
N=250,000. Debian Linux/x86.
 
N=250,000. Debian Linux/x86.
 
{| border="1"
 
{| border="1"
| Brian Sniffen's Map #1 || 30.7s || 86M || -funbox-strict-fields || ||
+
| Brian Sniffen's Map #1 || 30.7s || 86M || -funbox-strict-fields ||
 
|-
 
|-
| Einar's original || 24.8s || 260M || || ||
+
| Einar's original || 24.8s || 260M || ||
 
|-
 
|-
| Generalised Trie #1 || 23.8s || 93M || +RTS -K32m || removed ||
+
| Generalised Trie #1 || 23.8s || 93M || +RTS -K32m || removed
 
|-
 
|-
| Brian Sniffen's Map #2 || 21.7s || 86M || -funbox-strict-fields || ||
+
| Brian Sniffen's Map #2 || 21.7s || 86M || -funbox-strict-fields ||
 
|-
 
|-
| Chris's current entry || 18.5s || 65M || ||
+
| Chris's current entry || 18.5s || 65M || ||
 
|-
 
|-
| Generalised Trie #2 || 13.8s ||100M || +RTS -K32m || removed ||
+
| Generalised Trie #2 || 13.8s ||100M || +RTS -K32m || removed
 
|-
 
|-
| Generalised Trie #2 || 4.1s ||100M || +RTS -K32m -H100m || removed ||
+
| Generalised Trie #2 || 4.1s ||100M || +RTS -K32m -H100m || removed
 
|-
 
|-
| Ketil's Trie || 3.6s || 76M || -funbox-strict-fields || ||
+
| Ketil's Trie || 3.6s || 76M || -funbox-strict-fields ||
 
|-
 
|-
| Generalised Trie w/ Assoc List #2 || 3.5s || 100M || -funbox-strict-fields || ||
+
| Generalised Trie w/ Assoc List #2 || 3.5s || 100M || -funbox-strict-fields ||
 
|}
 
|}
 
=== More benchmarks ===
 
=== More benchmarks ===
Line 88: Line 88:
 
It is important to add "+RTS -A100m -RTS" to all the entries. This would help the entry in the shootout, as well.
 
It is important to add "+RTS -A100m -RTS" to all the entries. This would help the entry in the shootout, as well.
 
{| border="1"
 
{| border="1"
| PROGRAM || Normal (GC%) || -A (GC%) ||
+
| PROGRAM || Normal (GC%) || -A (GC%)
 
|-
 
|-
| Data.Hashtable || 59.28s (41.6%) || 44.58s ( 9.7%) ||
+
| Data.Hashtable || 59.28s (41.6%) || 44.58s ( 9.7%)
 
|-
 
|-
| Brain Sniffen's Map, v2 || 66.67s (62.2%) || 30.48s (16.0%) ||
+
| Brain Sniffen's Map, v2 || 66.67s (62.2%) || 30.48s (16.0%)
 
|-
 
|-
| Ketil's Trie || 15.85s (84.0%) || 5.04s (43.7%) ||
+
| Ketil's Trie || 15.85s (84.0%) || 5.04s (43.7%)
 
|-
 
|-
| Ketil's Trie -A800m || - || 3.33s ( 0.0%) ||
+
| Ketil's Trie -A800m || - || 3.33s ( 0.0%)
 
|}
 
|}
  +
 
== On mutable data structures in Haskell ==
 
== On mutable data structures in Haskell ==
   
Line 164: Line 165:
 
total = fromIntegral (size - n + 1)
 
total = fromIntegral (size - n + 1)
   
kf (Node k x) (Node j y) = case compare y x of
+
kf (Node k x) (Node j y) = compare (y, ppr n k) (x, ppr n j)
EQ -> compare (ppr n k) (ppr n j); x -> x
 
   
 
writeFrame size p (n,k) = do
 
writeFrame size p (n,k) = do
Line 347: Line 348:
 
total = fromIntegral (size - n + 1)
 
total = fromIntegral (size - n + 1)
   
kf (k,x) (j,y) = case compare y x of
+
kf (k,x) (j,y) = compare (y, ppr n k) (x, ppr n j)
EQ -> compare (ppr n k) (ppr n j); x -> x
 
   
 
writeFrame (size,p) (n,k) = do
 
writeFrame (size,p) (n,k) = do
Line 429: Line 430:
   
 
{-# INLINE cmpmem #-}
 
{-# INLINE cmpmem #-}
cmpmem i ptr1 ptr2 = if i == 0 then return EQ else do
+
cmpmem 0 _ _ = EQ
  +
cmpmem i ptr1 ptr2 = do
 
cmp <- liftM2 compare (peek ptr1) (peek ptr2)
 
cmp <- liftM2 compare (peek ptr1) (peek ptr2)
 
case cmp of EQ -> cmpmem (pred i) (ptr1 `advancePtr` 1) (ptr2 `advancePtr` 1)
 
case cmp of EQ -> cmpmem (pred i) (ptr1 `advancePtr` 1) (ptr2 `advancePtr` 1)
Line 483: Line 484:
 
readUntil pred = do
 
readUntil pred = do
 
l <- getLine
 
l <- getLine
if pred l
+
unless (pred l)
then return ()
+
(readUntil pred)
else readUntil pred
 
   
 
skipComment = do
 
skipComment = do
Line 640: Line 641:
 
import System.IO
 
import System.IO
 
import Text.Printf(printf)
 
import Text.Printf(printf)
import Control.Monad(unless)
+
import Control.Monad(unless,zipWithM_)
 
import Data.List hiding (insert)
 
import Data.List hiding (insert)
 
import Data.Maybe(maybe,fromMaybe)
 
import Data.Maybe(maybe,fromMaybe)
Line 660: Line 661:
 
showFrequencies 1 t
 
showFrequencies 1 t
 
showFrequencies 2 t
 
showFrequencies 2 t
mapM_ putStrLn $ map (\(f,s)->(show f)++('\t':s)) $ zip (map (lookupFrequency t) seqStrings) seqStrings
+
zipWithM_ (\f s->putStrLn $ (show f)++('\t':s)) (map (lookupFrequency t) seqStrings) seqStrings
   
 
insert :: Trie -> String -> Trie
 
insert :: Trie -> String -> Trie
Line 684: Line 685:
 
convert (s,f) = (s,100 * fromIntegral f / tot)
 
convert (s,f) = (s,100 * fromIntegral f / tot)
 
mySort = sortBy freqAndKey
 
mySort = sortBy freqAndKey
where freqAndKey (k1,x) (k2,y) = case compare y x of
+
where freqAndKey (k1,x) (k2,y) = compare (y,k1) (x,k2)
EQ -> compare k1 k2
 
z -> z
 
 
in do mapM_ printBSF (mySort asPercent)
 
in do mapM_ printBSF (mySort asPercent)
 
putChar '\n'
 
putChar '\n'
Line 760: Line 761:
 
where convert (s,f) = (s,100 * fromIntegral f / tot)
 
where convert (s,f) = (s,100 * fromIntegral f / tot)
 
tot = fromIntegral ( sum $ map (tc.snd) t) :: Double
 
tot = fromIntegral ( sum $ map (tc.snd) t) :: Double
freqAndKey (k1,x) (k2,y) = case compare y x of
+
freqAndKey (k1,x) (k2,y) = compare (y,k1) (x,k2)
EQ -> compare k1 k2
 
z -> z
 
 
in do mapM_ printBSF (sortBy freqAndKey asPercent)
 
in do mapM_ printBSF (sortBy freqAndKey asPercent)
 
putChar '\n'
 
putChar '\n'
Line 833: Line 834:
 
</haskell>
 
</haskell>
   
== Current entry ==
+
== Previous current entry ==
   
 
This is an attempt to make the standard HashTable perform well enough. On my powerbook G4 it is slightly faster than the current shootout entry and slightly faster than Don's packed buffer below. It uses 43 MB of RSIZE. The winning [http://shootout.alioth.debian.org/gp4/benchmark.php?test=knucleotide&lang=gcc&id=0 c-code entry] uses a hash table at the shootout [http://cvs.alioth.debian.org/cgi-bin/cvsweb.cgi/shootout/bench/Include/?cvsroot=shootout site]. Perhaps we should FFI the same hash table code.
 
This is an attempt to make the standard HashTable perform well enough. On my powerbook G4 it is slightly faster than the current shootout entry and slightly faster than Don's packed buffer below. It uses 43 MB of RSIZE. The winning [http://shootout.alioth.debian.org/gp4/benchmark.php?test=knucleotide&lang=gcc&id=0 c-code entry] uses a hash table at the shootout [http://cvs.alioth.debian.org/cgi-bin/cvsweb.cgi/shootout/bench/Include/?cvsroot=shootout site]. Perhaps we should FFI the same hash table code.
Line 842: Line 843:
   
 
<haskell>
 
<haskell>
  +
{-# OPTIONS -fglasgow-exts -fbang-patterns -funbox-strict-fields #-}
  +
--
 
-- The Computer Language Shootout
 
-- The Computer Language Shootout
 
-- http://shootout.alioth.debian.org/
 
-- http://shootout.alioth.debian.org/
 
--
 
--
-- Created by Chris Kuklewicz and Don Stewart
+
-- Contributed by Don Stewart
--
+
-- Uses a port of the simple hashtable from the Clean entry
-- compile with: ghc -O3 -optc-O3 -fasm -fglasgow-exts knuc-1.hs -o knuc-1.ghc_run
 
 
--
 
--
-- run with: ./knuc-1.ghc_run +RTS -H40m -RTS %A
 
   
 
import GHC.Exts
 
import GHC.Exts
 
import GHC.IOBase
 
import GHC.IOBase
import Control.Monad
+
 
import Foreign
 
import Foreign
import Text.Printf(printf)
+
import Char
import Data.List(isPrefixOf,sortBy)
+
import List
import Data.Maybe(fromMaybe)
+
import Maybe
import Data.Char(ord,chr,toUpper)
+
import Text.Printf
import qualified Data.HashTable as T
 
   
searchStrings = ["GGT","GGTA","GGTATT","GGTATTTTAATT","GGTATTTTAATTTATAGT"]
+
import Data.ByteString.Internal
  +
import qualified Data.ByteString.Char8 as S
   
main = do section <- getSection ">THREE"
+
import Data.Array.Base
mapM_ (writeFreqs section) [1,2]
+
import qualified Data.Array.IO as A
mapM_ (writeFrame section) =<< mapM stringToSeq searchStrings
 
   
getSection prefix = do findPrefix
+
main = do
baseArray <- newArray0 0 =<< getRest =<< skipComments
+
(PS fp o l) <- get (S.pack ">TH")
size <- lengthArray0 0 baseArray
+
withForeignPtr fp $ \p -> do
return (size,baseArray)
+
let sec = p `plusPtr` o
where findPrefix = do line <- getLine; unless (isPrefixOf prefix line) findPrefix
+
mapM_ (writeFreqs l sec) [1,2]
skipComments = do line <- getLine
+
mapM_ (writeFrame l sec) =<< mapM toseq ["GGT","GGTA",
if ';' == head line then skipComments else return line
+
"GGTATT","GGTATTTTAATT",
getRest line = liftM asBases getContents
+
"GGTATTTTAATTTATAGT"]
where asBases = (concatMap c2b).(takeWhile (('>'/=).head)).(line:).lines
 
   
newTable :: Int -> IO (T.HashTable (Ptr Word8) Int)
+
get p = do
newTable frameSize = T.new (eqSeq frameSize) (hashSeq frameSize)
+
s <- S.getContents
  +
let Just n = S.findSubstring p s
  +
return $! S.map toUpper -- array fusion!
  +
. S.filter ((/=) '\n')
  +
. S.dropWhile ((/=) '\n')
  +
. S.copy
  +
. S.drop n $ s
   
-- (countFreq (size,bases)) satisfies the requirement to "define a
+
writeFreqs size p n = do
-- procedure/function to update a hashtable of k-nucleotide keys and
+
h <- htNew n size
-- count values, for a particular reading-frame"
+
htInsert size p n h
countFreq (size,bases) frameSize table = mapSeq >> return table
+
let vs = htNodes h
where mapSeq = mapM_ (countFrame . (advancePtr bases)) [0..(size-frameSize)]
+
mapM_ draw (sortBy kf vs)
countFrame frame = do mOld <- T.lookup table frame
+
putChar '\n'
T.update table frame $! maybe 1 succ mOld
+
where
  +
draw (Node p f) = printf "%s %.3f\n" (ppr n p) pct
  +
where pct = (100 * (fromIntegral f) / total) :: Double
  +
total = fromIntegral (size - n + 1)
   
writeFreqs sb@(size,_) frameSize = do
+
kf (Node k x) (Node j y) = compare (y, ppr n k) (x, ppr n j)
let printBSF (bs,f) = printf "%s %.3f\n" (showSeq frameSize bs) (percent f)
 
where percent n = (100 * (fromIntegral n) / total) :: Double
 
total = fromIntegral (size - frameSize + 1)
 
freqAndKey (k1,x) (k2,y) = case compare y x of
 
EQ -> compare (showSeq frameSize k1) (showSeq frameSize k2)
 
lt'gt -> lt'gt
 
unsorted <- T.toList =<< countFreq sb frameSize =<< newTable frameSize
 
mapM_ printBSF (sortBy freqAndKey unsorted) >> putChar '\n'
 
   
writeFrame sb@(size,_) (frameSize,frameSeq) = do
+
writeFrame size p (n,k) = do
mAnswer <- flip T.lookup frameSeq =<< countFreq sb frameSize =<< newTable frameSize
+
h <- htNew n size
putStrLn $ (show $ fromMaybe 0 mAnswer) ++ ('\t' : (showSeq frameSize frameSeq))
+
htInsert size p n h
  +
Node k v <- htFind k h
  +
putStrLn $ (show v) ++ ('\t' : ppr n k)
   
c2b = map (toEnum . ord . toUpper)
+
ppr n p = inlinePerformIO (map w2c `fmap` peekArray n p)
  +
toseq s = fmap ((,) (length s)) (newArray0 0 (map c2w s))
   
stringToSeq str = liftM ((,) (length str)) (newArray0 0 (c2b str))
+
------------------------------------------------------------------------
  +
--
  +
-- An implementation of simpl_hash.c in Haskell
  +
--
   
showSeq fs ptr = unsafePerformIO $ peekArray fs ptr >>= return.(map (chr . fromEnum))
+
data Hash = HT !Int !Int !(A.IOArray Int Buckets)
   
-- --
+
data Buckets = Empty | Bucket !(Ptr Word8) !Int | Buckets [Node]
-- -- Performance tweaked routines for (HashTable Seq Int)
 
-- --
 
   
{-# INLINE inlinePerformIO #-}
+
data Node = Node !(Ptr Word8) !Int
inlinePerformIO (IO m) = case m realWorld# of (# _, r #) -> r
 
   
hashSeq (I# frameSize) (Ptr ptr) = inlinePerformIO $ IO $ (\s -> hashmem frameSize ptr 0# s)
+
htNew :: Int -> Int -> IO Hash
  +
htNew !fl !sz = HT fl nprime `fmap` A.newArray (0,nprime-1) Empty
  +
where
  +
n = htSize fl sz
  +
nprime = head (dropWhile (< n) primes)
   
{-# INLINE hashmem #-}
+
htSize :: Int -> Int -> Int
hashmem remainingSize ptr runningHash s = if remainingSize ==# 0# then (# s, toEnum (I# runningHash) #)
+
htSize !fl !buflen = min lim (go (fl-1) 4)
else case readInt8OffAddr# ptr 0# s of { (# s, i8 #) ->
+
where
hashmem (remainingSize -# 1#) (plusAddr# ptr 1#) ((-1640531527#) *# runningHash +# i8) s }
+
lim = (buflen - fl) `div` 1024
  +
go !n !m | n > 0 && m < lim = go (n-1) (m*4)
  +
| otherwise = m
   
eqSeq (I# frameSize) (Ptr ptr1) (Ptr ptr2) = inlinePerformIO $ IO $ (\s -> eqmem frameSize ptr1 ptr2 s)
+
htInsert :: Int -> Ptr Word8 -> Int -> Hash -> IO ()
  +
htInsert !s !p n !h = mapM_ (htInc h . plusPtr p) [0..s-n]
   
{-# INLINE eqmem #-}
+
htInc :: Hash -> Ptr Word8 -> IO ()
eqmem remainingSize ptr1 ptr2 s = if remainingSize ==# 0# then (# s , True #)
+
htInc (HT n size arr) k =
else case readInt8OffAddr# ptr1 0# s of { (# s, i8a #) ->
+
case htHash size n k of
case readInt8OffAddr# ptr2 0# s of { (# s, i8b #) ->
+
!i -> do b <- unsafeRead arr i
if i8a /=# i8b then (# s, False #)
+
unsafeWrite arr i $! inc b
else eqmem (remainingSize -# 1#) (plusAddr# ptr1 1#) (plusAddr# ptr2 1#) s } }
+
where
  +
equal = eq n
  +
  +
inc :: Buckets -> Buckets
  +
inc (Bucket !k' !v)
  +
| k' `equal` k = Bucket k' (v+1)
  +
| otherwise = Buckets $ Node k' v : [Node k 1]
  +
inc (Buckets b) = Buckets $ incL b
  +
inc Empty = Bucket k 1
  +
  +
incL :: [Node] -> [Node]
  +
incL (!i@(Node k' v):ls)
  +
| k' `equal` k = Node k' (v+1) : ls
  +
| otherwise = i : incL ls
  +
incL [] = [Node k 1]
  +
  +
htNodes :: Hash -> [Node]
  +
htNodes (HT _ size arr) = items 0
  +
where
  +
read i = inlinePerformIO $! unsafeRead arr i
  +
  +
items !i | i >= size = []
  +
| otherwise = items_bucket (read i) (i+1)
  +
  +
items_bucket !(Bucket !k' !v) i = Node k' v : items i
  +
items_bucket !(Buckets !b) i = items_list b i
  +
items_bucket Empty !i = items i
  +
  +
items_list (!e:l) !i = e : items_list l i
  +
items_list [] !i = items i
  +
  +
htFind :: Ptr Word8 -> Hash -> IO Node
  +
htFind !k !(HT n size arr) = do
  +
let !i = htHash size n k
  +
v <- unsafeRead arr i
  +
return $! find v
  +
where
  +
equal = eq n
  +
  +
find (Bucket k' v) | k' `equal` k = Node k' v
  +
| otherwise = Node k 0
  +
find (Buckets l) = find' l
  +
find Empty = Node k 0
  +
  +
find' (i@(Node !k' _):ls) | k' `equal` k = i
  +
| otherwise = find' ls
  +
find' [] = Node k 0
  +
  +
htHash :: Int -> Int -> Ptr Word8 -> Int
  +
htHash (I# max) (I# size) (Ptr p) = abs . inlinePerformIO . IO $ go p 0#
  +
where
  +
lim = p `plusAddr#` size
  +
go p acc !s
  +
| p `geAddr#` lim = (# s, I# (acc `remInt#` max) #)
  +
| otherwise = case readInt8OffAddr# p 0# s of
  +
(# s, i #) -> go (p `plusAddr#` 1#) (5# *# acc +# i) s
  +
  +
-- A fast Ptr comparison for Hash keys
  +
eq :: Int -> Ptr Word8 -> Ptr Word8 -> Bool
  +
eq !n p q = inlinePerformIO $ do
  +
a <- peek p :: IO Word8
  +
b <- peek q :: IO Word8
  +
if a /= b then return False
  +
else go n p q
  +
where
  +
go !n !p !q
  +
| n == 0 = return True
  +
| otherwise = do
  +
a <- peek p :: IO Word8
  +
b <- peek q :: IO Word8
  +
if a /= b then return False
  +
else go (n-1) (p `plusPtr` 1) (q `plusPtr` 1)
  +
  +
primes = [ 53, 97, 193, 389, 769,
  +
1543, 3079, 6151, 12289, 24593,
  +
49157, 98317, 196613, 93241, 786433,
  +
1572869, 3145739, 6291469, 12582917, 25165843,
  +
50331653, 100663319, 201326611, 402653189, 805306457 ]
  +
  +
</haskell>
  +
  +
== Current entry ==
  +
  +
by Stephen Blackheath
  +
  +
This improves on Don Stewart's by speeding the hash table up a fair bit, adding parallelism, and it also complies better with the rules by implementing the required automatic growing of the hash table.
  +
  +
<haskell>
  +
{-# LANGUAGE BangPatterns #-}
  +
{-# OPTIONS_GHC -O2 #-}
  +
  +
-- The Computer Language Benchmarks Game
  +
-- http://shootout.alioth.debian.org/
  +
--
  +
-- contributed by Stephen Blackheath (with some bits taken from Don Stewart's
  +
-- version), v1.2
  +
  +
import Text.Printf
  +
import Data.ByteString.Internal
  +
import qualified Data.ByteString.Char8 as S
  +
import Control.Applicative
  +
import Control.Monad
  +
import Control.Concurrent
  +
import Foreign.Storable
  +
import Foreign.Marshal.Alloc
  +
import Foreign.Marshal.Array
  +
import Foreign.Ptr
  +
import Foreign.ForeignPtr
  +
import Data.Word
  +
import Data.Bits
  +
import Data.Char
  +
import Data.List
  +
import Data.Maybe
  +
import Data.IORef
  +
import GHC.Exts
  +
  +
  +
main = do
  +
genome <- extract (S.pack ">TH")
  +
let actions = [
  +
do
  +
a <- printFreqsBySize genome 1
  +
b <- printFreqsBySize genome 2
  +
return $ a ++ b
  +
] ++ map (printFreqsSpecific genome) specificSeqs
  +
output <- concat <$> parallel actions
  +
forM_ output putStrLn
  +
  +
-- Drop in replacement for sequence
  +
parallel :: [IO a] -> IO [a]
  +
parallel actions = do
  +
vars <- forM actions $ \action -> do
  +
var <- newEmptyMVar
  +
forkIO $ do
  +
answer <- action
  +
putMVar var answer
  +
return var
  +
forM vars takeMVar
  +
  +
specificSeqs = map S.pack [
  +
"GGT","GGTA","GGTATT","GGTATTTTAATT","GGTATTTTAATTTATAGT"]
  +
  +
extract p = do
  +
s <- S.getContents
  +
let (_, rem) = S.breakSubstring p s
  +
return $! S.map toUpper -- array fusion!
  +
. S.filter ((/=) '\n')
  +
. S.dropWhile ((/=) '\n') $ rem
  +
  +
printFreqsBySize :: S.ByteString -> Int -> IO [String]
  +
printFreqsBySize genome keySize = do
  +
ht0 <- htNew keySize
  +
ht <- hashGenome genome keySize ht0
  +
l <- htToList ht
  +
htFree ht
  +
return $ map draw (sortBy sortRule l) ++ [""]
  +
where
  +
genomeLen = S.length genome
  +
draw :: (S.ByteString, Int) -> String
  +
draw (key, count) = printf "%s %.3f" (S.unpack key) pct
  +
where pct = (100 * (fromIntegral count) / total) :: Double
  +
total = fromIntegral (genomeLen - keySize + 1)
  +
  +
printFreqsSpecific :: S.ByteString -> S.ByteString -> IO [String]
  +
printFreqsSpecific genome seq = do
  +
let keySize = S.length seq
  +
genomeLen = S.length genome
  +
ht0 <- htNew keySize
  +
ht <- hashGenome genome keySize ht0
  +
let (fp, offset, len) = toForeignPtr seq
  +
count <- withForeignPtr fp $ \p_ -> do
  +
htGet ht (p_ `plusPtr` offset)
  +
htFree ht
  +
return [show count ++ ('\t' : S.unpack seq)]
  +
  +
hashGenome :: S.ByteString
  +
-> Int
  +
-> Hashtable
  +
-> IO Hashtable
  +
{-# INLINE hashGenome #-}
  +
hashGenome genome keySize ht = do
  +
let (fp, offset, len) = toForeignPtr genome
  +
withForeignPtr fp $ \p_ -> do
  +
let p = p_ `plusPtr` offset
  +
loop ht idx = do
  +
let key = p `plusPtr` idx
  +
htInc ht key
  +
endIdx = len - keySize + 1
  +
foldMe i ht | i == endIdx = return ht
  +
foldMe i ht = loop ht i >>= foldMe (i+1)
  +
foldMe 0 ht
  +
  +
sortRule :: (S.ByteString, Int) -> (S.ByteString, Int) -> Ordering
  +
sortRule (a1, b1) (a2, b2) = compare (b2, a1) (b1, a2)
  +
  +
------ Hash table implementation ----------------------------------------------
  +
  +
-- Note: Hash tables are not generally used in functional languages, so there
  +
-- are no available library implementations for Haskell. This benchmark
  +
-- requires a hash table. This is why I have implemented the hash table here.
  +
  +
htNew :: Int -> IO Hashtable
  +
htNew = htNew' (head primes)
  +
  +
htNew' :: Int -> Int -> IO Hashtable
  +
htNew' slots ksz = do
  +
let ssz = spineSize ksz slots
  +
sp <- mallocBytes ssz
  +
memset sp 0 (fromIntegral ssz)
  +
return $ Hashtable {
  +
keySize = ksz,
  +
noOfSlots = slots,
  +
spine = sp
  +
}
  +
  +
primes = [ 1543, 3079, 6151, 12289, 24593,
  +
49157, 98317, 196613, 93241, 786433,
  +
1572869, 3145739, 6291469, 12582917, 25165843,
  +
50331653, 100663319, 201326611, 402653189, 805306457 ]
  +
  +
htFree :: Hashtable -> IO ()
  +
htFree ht = do
  +
htTraverse ht $ \isSpine (Entry ePtr) -> when (not isSpine) $ free ePtr
  +
free (spine ht)
  +
  +
htGet :: Hashtable -> Ptr Word8 -> IO Int
  +
htGet ht key = do
  +
hash <- calcHash ht key
  +
htPayload ht hash key >>= peek
  +
  +
htInc :: Hashtable -> Ptr Word8 -> IO Hashtable
  +
{-# INLINE htInc #-}
  +
htInc !ht !key = do
  +
hash <- calcHash ht key
  +
pPayload <- htPayload ht hash key
  +
value <- peek pPayload
  +
poke pPayload (value+1)
  +
if value == 0 && (hash .&. 0x7f) == 0
  +
then checkGrow ht
  +
else return ht
  +
  +
htPut_ :: Hashtable -> Ptr Word8 -> Int -> IO ()
  +
{-# INLINE htPut_ #-}
  +
htPut_ !ht !key !value = do
  +
hash <- calcHash ht key
  +
pPayload <- htPayload ht hash key
  +
poke pPayload value
  +
  +
checkGrow :: Hashtable -> IO Hashtable
  +
checkGrow ht = do
  +
let pTotal = totalEntriesOf ht
  +
slots = noOfSlots ht
  +
total <- (0x200+) <$> peek pTotal
  +
poke pTotal total
  +
if total >= slots
  +
then do
  +
let newSlots = head $ dropWhile (<= slots*2) primes
  +
ht' <- htNew' newSlots (keySize ht)
  +
htTraverse ht $ \_ -> reinsert ht' -- re-insert all the elts
  +
htFree ht
  +
poke (totalEntriesOf ht') total -- copy the total entry count
  +
return ht'
  +
else return ht
  +
where
  +
reinsert :: Hashtable -> Entry -> IO ()
  +
reinsert ht entry = do
  +
value <- peek (payloadOf entry)
  +
htPut_ ht (keyOf entry) value
  +
  +
htToList :: Hashtable -> IO [(S.ByteString, Int)]
  +
htToList ht =
  +
htMap (\entry -> do
  +
keyStr <- keyString ht (keyOf entry)
  +
payload <- peek (payloadOf entry)
  +
return (keyStr, payload)) ht
  +
  +
htMap :: (Entry -> IO a) -> Hashtable -> IO [a]
  +
htMap f ht = mapM f =<< htEntries ht
  +
  +
keyString :: Hashtable -> Ptr Word8 -> IO S.ByteString
  +
keyString ht key = S.pack . map w2c <$> peekArray (keySize ht) key
  +
  +
isEmptySlot :: Entry -> IO Bool
  +
{-# INLINE isEmptySlot #-}
  +
isEmptySlot entry = do
  +
ch <- peek $ keyOf entry
  +
return $ ch == 0
  +
  +
htEntries :: Hashtable -> IO [Entry]
  +
htEntries ht = do
  +
es <- newIORef []
  +
htTraverse ht $ \_ entry -> modifyIORef es $ \l -> entry:l
  +
readIORef es
  +
  +
htTraverse :: Hashtable -> (Bool -> Entry -> IO ()) -> IO ()
  +
htTraverse ht f = he 0
  +
where
  +
slots = noOfSlots ht
  +
he i | i == slots = return ()
  +
he i = do
  +
let entry = indexEntry ht i
  +
empty <- isEmptySlot entry
  +
if empty
  +
then he (i+1)
  +
else links True i entry
  +
links isSpine i entry = do
  +
next <- peek $ nextPtrOf entry
  +
f isSpine entry
  +
if next == nullPtr
  +
then he (i+1)
  +
else links False i (Entry next)
  +
  +
data Hashtable = Hashtable {
  +
keySize :: Int,
  +
noOfSlots :: Int,
  +
spine :: Ptr Word8
  +
}
  +
  +
wordSize :: Int
  +
wordSize = max (sizeOf (nullPtr :: Ptr Word8)) (sizeOf (0 :: Int))
  +
  +
-- Round up to word size
  +
roundUp :: Int -> Int
  +
{-# INLINE roundUp #-}
  +
roundUp !i = (i + wordSize - 1) .&. complement (wordSize - 1)
  +
  +
slotSize :: Int -> Int
  +
{-# INLINE slotSize #-}
  +
slotSize !ksz = roundUp ksz + wordSize * 2
  +
  +
spineSize :: Int -> Int -> Int
  +
spineSize ksz slots = slots * slotSize ksz + wordSize
  +
  +
calcHash :: Hashtable -> Ptr Word8 -> IO Int
  +
{-# INLINE calcHash #-}
  +
calcHash !ht !key = (`mod` noOfSlots ht) <$> ch 0 0
  +
where
  +
ksz = keySize ht
  +
ch :: Int -> Int -> IO Int
  +
ch !i !acc | i == ksz = return acc
  +
ch !i !acc = do
  +
c <- peek (key `plusPtr` i)
  +
ch (i+1) (acc * 131 + fromIntegral (c::Word8))
  +
  +
newtype Entry = Entry (Ptr Word8)
  +
  +
-- Count of the total number of hash table entries
  +
totalEntriesOf :: Hashtable -> Ptr Int
  +
{-# INLINE totalEntriesOf #-}
  +
totalEntriesOf ht = castPtr $ spine ht
  +
  +
indexEntry :: Hashtable -> Int -> Entry
  +
{-# INLINE indexEntry #-}
  +
indexEntry !ht !hash =
  +
let hOffset = wordSize + hash * slotSize (keySize ht)
  +
in Entry $ spine ht `plusPtr` hOffset
  +
  +
entryMatches :: Hashtable -> Entry -> Ptr Word8 -> IO Bool
  +
{-# INLINE entryMatches #-}
  +
entryMatches !ht !entry !key = do
  +
let eKey = keyOf entry
  +
c <- memcmp key eKey (fromIntegral $ keySize ht)
  +
if c == 0
  +
then return True
  +
else do
  +
empty <- isEmptySlot entry
  +
if empty
  +
then do
  +
memcpy eKey key (fromIntegral $ keySize ht) -- ick
  +
return True
  +
else
  +
return False
  +
  +
nextPtrOf :: Entry -> Ptr (Ptr Word8)
  +
{-# INLINE nextPtrOf #-}
  +
nextPtrOf !(Entry ePtr) = castPtr $ ePtr
  +
  +
payloadOf :: Entry -> Ptr Int
  +
{-# INLINE payloadOf #-}
  +
payloadOf !(Entry ePtr) = castPtr $ ePtr `plusPtr` wordSize
  +
  +
keyOf :: Entry -> Ptr Word8
  +
{-# INLINE keyOf #-}
  +
keyOf !(Entry ePtr) = ePtr `plusPtr` (wordSize*2)
  +
  +
allocEntry :: Hashtable -> Ptr Word8 -> IO Entry
  +
allocEntry !ht !key = do
  +
let esz = slotSize $ keySize ht
  +
ePtr <- mallocBytes esz
  +
memset ePtr 0 (fromIntegral esz)
  +
let entry = Entry ePtr
  +
memcpy (keyOf entry) key (fromIntegral $ keySize ht)
  +
return entry
  +
  +
htPayload :: Hashtable -> Int -> Ptr Word8 -> IO (Ptr Int)
  +
{-# INLINE htPayload #-}
  +
htPayload !ht !hash !key = do
  +
entry <- findEntry (indexEntry ht hash)
  +
return $ payloadOf entry
  +
where
  +
findEntry :: Entry -> IO Entry
  +
findEntry !entry = do
  +
match <- entryMatches ht entry key
  +
if match
  +
then
  +
return entry
  +
else do
  +
let pNext = nextPtrOf entry
  +
next <- peek pNext
  +
if next == nullPtr
  +
then do
  +
newEntry@(Entry ePtr) <- allocEntry ht key
  +
poke pNext ePtr
  +
return newEntry
  +
else
  +
findEntry (Entry next)
 
</haskell>
 
</haskell>
   
Line 1,012: Line 1,014:
 
return $! M.findWithDefault 0 (P p sl) m
 
return $! M.findWithDefault 0 (P p sl) m
   
putStr ((show c) ++ "\t") >> hPutBuf stdout p sl >> putChar '\n'
+
putStr ((show c) ++ "\t")
  +
hPutBuf stdout p sl
  +
putChar '\n'
   
 
-- update a hash of k-nucleotide strings to their frequency
 
-- update a hash of k-nucleotide strings to their frequency

Latest revision as of 22:25, 18 February 2010

This is a ShootoutEntry for [1].

[This page is completely unreadable. Would it make sense to put the programs into their own sub-pages?] [ Personally, I agree - Perhaps the original writers would care to do so? BrettGiles 02:24, 15 February 2007 (UTC)]

About the k-nucleotide benchmark

Each program should

  • read line-by-line a redirected FASTA format file from stdin
  • extract DNA sequence THREE
  • define a procedure/function to update a hashtable of k-nucleotide keys and count values, for a particular reading-frame - even though we'll combine k-nucleotide counts for all reading-frames (grow the hashtable from a small default size)
  • write the code and percentage frequency, for all the 1-nucleotide and 2-nucleotide sequences, sorted by descending frequency and then ascending k-nucleotide key
  • count all the 3- 4- 6- 12- and 18-nucleotide sequences, and write the count and code for specific sequences

We use FASTA files generated by the fasta benchmark as input for this benchmark. Note: the file may include both lowercase and uppercase codes.

Correct output for this 100KB input file (generated with the fasta program N = 10000), is

   A 30.284
   T 29.796
   C 20.312
   G 19.608
   AA 9.212
   AT 8.950
   TT 8.948
   TA 8.936
   CA 6.166
   CT 6.100
   AC 6.086
   TC 6.042
   AG 6.036
   GA 5.968
   TG 5.868
   GT 5.798
   CC 4.140
   GC 4.044
   CG 3.906
   GG 3.798
   562     GGT
   152     GGTA
   15      GGTATT
   0       GGTATTTTAATT
   0       GGTATTTTAATTTATAGT

In practice, less brute-force would be used to calculate k-nucleotide frequencies, for example Virus Classification using k-nucleotide Frequencies and A Fast Algorithm for the Exhaustive Analysis of 12-Nucleotide-Long DNA Sequences. Applications to Human Genomics (105KB pdf).

This shootout benchmark has really nailed a deficiency in the standard libraries. We could work around not having fast ascii strings with a few extra functions, but not having a fast associative data structure is another matter. We cannot win this one on speed; this entry's approach cannot win this on lines of code; and memory usage is dominated by the data. That is why I feel the main goal is making a new entry that reads well and avoids the memory leak of the old entry. -- ChrisKuklewicz

Contents

[edit] 1 Plan

Ok, where do we go from here:

 * Can we write a legal Map-based version of Ketil's Trie somehow?

-- DonStewart

I converted it naively and you need to benchmark it, see sections 5.1 and 5.2 for map and association list version. -- ChrisKuklewicz

[edit] 2 Benchmarks

We must test all entries on N=250k, as that is what they use in the shootout

N=250,000. Debian Linux/x86.

Brian Sniffen's Map #1 30.7s 86M -funbox-strict-fields
Einar's original 24.8s 260M
Generalised Trie #1 23.8s 93M +RTS -K32m removed
Brian Sniffen's Map #2 21.7s 86M -funbox-strict-fields
Chris's current entry 18.5s 65M
Generalised Trie #2 13.8s 100M +RTS -K32m removed
Generalised Trie #2 4.1s 100M +RTS -K32m -H100m removed
Ketil's Trie 3.6s 76M -funbox-strict-fields
Generalised Trie w/ Assoc List #2 3.5s 100M -funbox-strict-fields

[edit] 2.1 More benchmarks

On a powerbook G4:

It is important to add "+RTS -A100m -RTS" to all the entries. This would help the entry in the shootout, as well.

PROGRAM Normal (GC%) -A (GC%)
Data.Hashtable 59.28s (41.6%) 44.58s ( 9.7%)
Brain Sniffen's Map, v2 66.67s (62.2%) 30.48s (16.0%)
Ketil's Trie 15.85s (84.0%) 5.04s (43.7%)
Ketil's Trie -A800m - 3.33s ( 0.0%)

[edit] 3 On mutable data structures in Haskell

This benchmark is completely bottlenecked by Data.Hashtable (in GHC 6.4.1) and this discussion is based on the assumption that I am using Hashtable optimally. I downloaded the GHD 0.17 compiler and the DMD entry to benchmark on my machine. The DMD entry uses the "associative arrays" built into the language: "int[char[]] frequencies" and places second (runs in 3.0s). The winning entry is interesting, since the c-code does not have a hash table, and so it uses #include "../../Include/simple_hash.h" which pulls in a dead simple, inline, string to int hashtable and runs in 2s.

The entry below runs 16 slower than the DMD entry on my powerbook G4. Profiling shows the bottleneck. I downloaded simple_hash.h and shamelessly optimized it to replace Data.Hashtable for exactly this benchmark code. This sped up the proposed entry by a factor of 4.1 so that it is now about a factor of 4 slower than the DMD entry. This shows that Data.Hashtable is doing *at least* four times more work that is needed for this usage pattern. And even with my over specialized hash table, Haskell is almost 50% slower than OCaml's "module H = Hashtbl.Make(S)" (note that I my code uses the same hash function as OCaml). Unfortunately I cannot submit this optimized hash table entry to the shootout.

The only mutable data structure that come with GHC besides arrays is Data.Hashtable, which is not comptetitive with OCaml Hashtbl or DMD's associative arrays (unless there is something great hidden under Data.Graph). Is there any hope for GHC 6.6? Does anyone have pointers to an existing library at all? Perl and Python and Lua also have excellent built in hashtable capabilities. Where is a good library for Haskell?

[edit] 4 Custom HashTable + ByteStrings

Wipes the other HashTable in Haskell off the map.

{-# OPTIONS -fglasgow-exts -fbang-patterns -funbox-strict-fields #-}
--
-- The Computer Language Shootout
-- http://shootout.alioth.debian.org/
--
-- Contributed by Don Stewart
-- Uses a port of the simple hashtable from the Clean entry
--
 
import GHC.Exts
import GHC.IOBase
 
import Foreign
import Char
import List
import Maybe
import Text.Printf
 
import Data.ByteString.Base
import qualified Data.ByteString.Char8 as S
 
import Data.Array.Base
import qualified Data.Array.IO as A
 
main = do
    (PS fp o l) <- get (S.pack ">TH")
    withForeignPtr fp $ \p -> do
        let sec = p `plusPtr` o
        mapM_ (writeFreqs l sec) [1,2]
        mapM_ (writeFrame l sec) =<< mapM toseq strs
 
strs = ["GGT","GGTA","GGTATT","GGTATTTTAATT","GGTATTTTAATTTATAGT"]
 
get p = do
    s <- S.getContents
    let Just n = S.findSubstring p s
    return $! S.map toUpper             -- array fusion!
            . S.filter    ((/=) '\n')
            . S.dropWhile ((/=) '\n')
            . S.copy
            . S.drop n $ s
 
writeFreqs size p n = do
    h   <- htNew n size
    htInsert size p n h
    let vs = htNodes h
    mapM_ draw (sortBy kf vs)
    putChar '\n'
  where
    draw (Node p f) = printf "%s %.3f\n" (ppr n p) pct
        where pct   = (100 * (fromIntegral f) / total) :: Double
              total = fromIntegral (size - n + 1)
 
    kf (Node k x) (Node j y) = compare (y, ppr n k) (x, ppr n j)
 
writeFrame size p (n,k) = do
    h <- htNew n size
    htInsert size p n h
    Node k v <- htFind k h
    putStrLn $ (show v) ++ ('\t' : ppr n k)
 
ppr n p = inlinePerformIO (map w2c `fmap` peekArray n p)
toseq s = fmap ((,) (length s)) (newArray0 0 (map c2w s))
 
------------------------------------------------------------------------
--
-- An implementation of simpl_hash.c in Haskell
--
 
data Hash    = HT !Int !Int !(A.IOArray Int Buckets)
 
data Buckets = Empty | Bucket !(Ptr Word8) !Int | Buckets [Node]
 
data Node    = Node !(Ptr Word8) !Int
 
htNew :: Int -> Int -> IO Hash
htNew !fl !sz = HT fl nprime `fmap` A.newArray (0,nprime-1) Empty
  where
    n      = htSize fl sz
    nprime = head (dropWhile (< n) primes)
 
htSize :: Int -> Int -> Int
htSize !fl !buflen = min lim (go (fl-1) 4)
  where
    lim = (buflen - fl) `div` 1024
    go !n !m | n > 0 && m < lim      = go (n-1) (m*4)
             | otherwise             = m
 
htInsert :: Int -> Ptr Word8 -> Int -> Hash -> IO ()
htInsert !s !p n !h = mapM_ (htInc h . plusPtr p) [0..s-n]
 
htInc :: Hash -> Ptr Word8 -> IO ()
htInc ht@(HT n size arr) k  =
    case htHash size n k of
        !i -> do b <- unsafeRead arr i
                 unsafeWrite arr i $! inc b
  where
    equal = eq n
 
    inc :: Buckets -> Buckets
    inc (Bucket !k' !v)
        | k' `equal` k = Bucket  k' (v+1)
        | otherwise    = Buckets $ Node k' v : [Node k 1]
    inc (Buckets b)    = Buckets $ incL b
    inc Empty          = Bucket k 1
 
    incL :: [Node] -> [Node]
    incL (!i@(Node k' v):ls)
        | k' `equal` k = Node k' (v+1) : ls
        | otherwise    = i : incL ls
    incL []            = [Node k 1]
 
htNodes :: Hash -> [Node]
htNodes ht@(HT n size arr) = items 0
  where
    read i = inlinePerformIO $! unsafeRead arr i
 
    items !i | i >= size = []
             | otherwise = items_bucket (read i) (i+1)
 
    items_bucket !(Bucket !k' !v) i = Node k' v : items i
    items_bucket !(Buckets !b) i    = items_list b i
    items_bucket Empty        !i    = items i
 
    items_list (!e:l) !i = e : items_list l i
    items_list []     !i = items i
 
htFind :: Ptr Word8 -> Hash -> IO Node
htFind !k !h@(HT n size arr) = do
    let !i = htHash size n k
    v <- unsafeRead arr i
    return $! find v
  where
    equal = eq n
 
    find  (Bucket k' v) | k' `equal` k = Node k' v
                        | otherwise    = Node k  0
    find  (Buckets l)   = find' l
    find  Empty         = Node k 0
 
    find' (i@(Node !k' v):ls) | k' `equal` k = i
                              | otherwise    = find' ls
    find' []           = Node k 0
 
htHash :: Int -> Int -> Ptr Word8 -> Int
htHash (I# max) (I# size) ptr@(Ptr p) = abs . inlinePerformIO . IO $ go p 0#
  where
    lim = p `plusAddr#` size
    go p acc !s
        | p `geAddr#` lim = (# s, I# (acc `remInt#` max) #)
        | otherwise       = case readInt8OffAddr# p 0# s of
                (# s, i #) -> go (p `plusAddr#` 1#) (5# *# acc +# i) s
 
-- A fast Ptr comparison for Hash keys
eq !n p q = inlinePerformIO $ do
    a <- peek p :: IO Word8
    b <- peek q :: IO Word8
    if a /= b then return False
              else go n p q
  where
    go !n !p !q
        | n == 0    = return True
        | otherwise = do
            a <- peek p :: IO Word8
            b <- peek q :: IO Word8
            if a /= b then return False
                      else go (n-1) (p `plusPtr` 1) (q `plusPtr` 1)
 
primes = [ 53,       97,        193,       389,       769,
           1543,     3079,      6151,      12289,     24593,
           49157,    98317,     196613,    93241,     786433,
           1572869,  3145739,   6291469,   12582917,  25165843,
           50331653, 100663319, 201326611, 402653189, 805306457 ]


[edit] 5 HashTable + ByteStrings

{-# OPTIONS -fbang-patterns #-}
--
-- The Computer Language Shootout
-- http://shootout.alioth.debian.org/
--
-- Chris Kuklewicz and Don Stewart
--
 
import Char
import Foreign
import List
import Maybe
import Text.Printf
import GHC.Exts
import GHC.Int
import GHC.IOBase
import Data.ByteString.Base
import qualified Data.ByteString.Char8 as S
import qualified Data.HashTable as T
 
main = do
    (PS fp o l) <- get (S.pack ">TH")
    withForeignPtr fp $ \p -> do
        let sec = (l, p `plusPtr` o)
        mapM_ (writeFreqs sec) [1,2]
        mapM_ (writeFrame sec) =<< mapM toseq strs
 
strs = ["GGT","GGTA","GGTATT","GGTATTTTAATT","GGTATTTTAATTTATAGT"]
 
get p = do
    s <- S.getContents
    let Just n = S.findSubstring p s
    return $! S.map toUpper             -- array fusion!
            . S.filter    ((/=) '\n')
            . S.dropWhile ((/=) '\n')
            . S.copy
            . S.drop n $ s
 
count :: Int -> Ptr Word8 -> Int -> Hash -> IO ()
count s !p n !h = mapM_ (inc . plusPtr p) [0..s-n]
  where
    inc !k = do
        mold <- T.lookup h k
        case mold of
            Nothing -> T.insert h k 1
            Just n  -> do T.update h k $! n+1 ; return ()
 
writeFreqs (size,p) n = do
    h   <- newH n
    count size p n h
    mapM_ draw . sortBy kf =<< T.toList h
    putChar '\n'
  where
    draw (p,f) = printf "%s %.3f\n" (ppr n p) pct
        where pct   = (100 * (fromIntegral f) / total) :: Double
              total = fromIntegral (size - n + 1)
 
    kf (k,x) (j,y) = compare (y, ppr n k) (x, ppr n j)
 
writeFrame (size,p) (n,k) = do
  h <- newH n
  count size p n h
  v <- T.lookup h k
  putStrLn $ (show $ fromMaybe 0 v) ++ ('\t' : ppr n k)
 
toseq s = fmap ((,) (length s)) (newArray0 0 (map c2w s))
ppr n p = inlinePerformIO (map w2c `fmap` peekArray n p)
 
------------------------------------------------------------------------
 
type Hash = T.HashTable (Ptr Word8) Int
 
newH :: Int -> IO Hash
newH n = T.new (eq n) (hash n)
 
hash n (Ptr p) = inlinePerformIO $ IO $ go n 0# p
  where
    go !n acc p s
        | n == 0    = (# s, I32# acc #)
        | otherwise = case readInt8OffAddr# p 0# s of
                (# s, i #) -> go (n-1) (5# *# acc +# i) (plusAddr# p 1#) s
 
-- Faster than a memcmp!
eq !n (Ptr p) (Ptr q) = inlinePerformIO $ IO $ go n p q
  where
    go !n p q s
        | n == 0    = (# s , True #)
        | otherwise = case readInt8OffAddr# p 0# s of
                (# s, a #) -> case readInt8OffAddr# q 0# s of
                    (# s, b #) | a /=# b   -> (# s, False #)
                               | otherwise -> go (n-1) (plusAddr# p 1#) (plusAddr# q 1#) s

[edit] 6 Data.Map #1

I built the following first using naive IO, and very closely following the OCaml implementation. It ran it about 85 seconds. I then lifted the Seq and advancePtr code from Chris and Don's "Haskell #2" entry. It now runs in under 4 seconds on my machine, faster than anything but the C and D entries. Note that "updateCount" is there to satisfy the shootout's demand for an updater function. --BrianSniffen

Note: don't submit entries with {- -} style comments. They aren't lexed correctly on the shootout, and end up contributing towards our line count score. Use -- comments only -- DonStewart

-- new-knucleotide.hs
--
-- The Great Computer Language Shootout
-- http://shootout.alioth.debian.org/
--
-- By Brian Sniffen, Chris Kuklewicz, and Don Stewart
-- 
 
import Data.Map as Map (Map, empty, insertWith, fold, foldWithKey, findWithDefault)
import Control.Monad
import System.IO
import Data.List (map,sort,isPrefixOf)
import Data.Char (ord,chr,toUpper)
 
import Foreign
 
import Text.Printf      (printf)
 
 
import GHC.Exts
import GHC.IOBase
 
type Base = Word8
c2b :: Char -> Base = fromIntegral . ord . toUpper
b2c :: Base -> Char = chr . fromIntegral
putWord8s = putStr . map b2c
 
-- The ptr are usually into the main fasta data, which is read-only
data Seq = Seq !Int !(Ptr Base) deriving Eq
 
instance Ord Seq where
  compare (Seq size1 ptr1) (Seq size2 ptr2) = case compare size1 size2 of
    EQ -> inlinePerformIO $ cmpmem size1 ptr1 ptr2
    z  -> z
 
{-# INLINE cmpmem #-}
cmpmem 0 _ _ = EQ
cmpmem i ptr1 ptr2 = do
    cmp <- liftM2 compare (peek ptr1) (peek ptr2)
    case cmp of EQ -> cmpmem (pred i) (ptr1 `advancePtr` 1) (ptr2 `advancePtr` 1)
                z  -> return z
 
 
seqStrings = ["GGT","GGTA","GGTATT","GGTATTTTAATT","GGTATTTTAATTTATAGT"]
 
 
substring s start len = take len . drop start $ s
 
 
-- [counts count k dna] fills the hashtable [count] of
-- k-nucleotide keys and count values for a particular reading-frame
-- of length [k] of the string [dna].
counts :: Int -> Seq -> Map Seq Int
counts = updateCounts Map.empty
updateCounts base k (Seq size dna) = 
  foldl' countFrame base [0..(size - k)] 
    where
      countFrame j i = increment (Seq k (advancePtr dna i)) j
 
increment k = Map.insertWith (+) k 1
 
-- [writeFrequencies count k dna] writes the frequencies for a
-- reading-frame of length [k] sorted by descending frequency and then
-- ascending k-nucleotide key. -}
percentConcat :: Int -> Seq -> Int -> [(Float,Seq)] -> [(Float, Seq)]
percentConcat tot k n l = (100 * (fromIntegral n) / (fromIntegral tot), k) : l
 
writeFrequencies :: Int -> Seq -> IO ()
writeFrequencies k dna = do
  let count = counts k dna
      tot = Map.fold (+) 0 count
      frq = Map.foldWithKey (percentConcat tot)
                            ([] :: [(Float,Seq)])
                            count
  mapM_ writeFreqRecord . reverse . sort $ frq
  putStrLn ""
 
writeFreqRecord :: (Float,Seq) -> IO ()
writeFreqRecord (f,k) = do
  printf "%s %.3f\n" (showSeq k) f
 
writeCount dna seq@(Seq len bases) = do
  mapM_ putStr [(show c), "\t", showSeq seq]
  putStrLn ""
      where 
        c = Map.findWithDefault 0 seq (counts len dna)
 
isThree = isPrefixOf ">THREE "
 
readUntil pred = do
  l <- getLine
  unless (pred l)
   (readUntil pred)
 
skipComment = do
  line <- getLine
  case line of
    ';':_ -> skipComment
    _     -> return (map toUpper line)
 
readSequence acc = do
  eof <- isEOF
  if eof
   then return (reverse acc)
   else do
     line <- getLine
     case line of
       '>':_ -> return (reverse acc)
       _     -> readSequence (map toUpper line : acc)
 
-- Routines to convert strings to Seq and back
stringToSeq str = do
    b <- newArray0 0 (map c2b str) 
    s <- lengthArray0 0 b 
    return (Seq s b)
 
showSeq (Seq size ptr) = inlinePerformIO $ do
  peekArray size ptr >>= return . (map b2c)
 
{-# INLINE inlinePerformIO #-}
inlinePerformIO (IO m) = case m realWorld# of (# _, r #) -> r
 
 
-- Extract DNA sequence "THREE" from stdin -}
dnaThree = do
  readUntil isThree
  firstLine <- skipComment
  otherLines <- readSequence []
  let blob = concatMap (map c2b) $ firstLine : otherLines
  baseArray <- newArray0 0 blob
  size <- lengthArray0 0 baseArray
  return (Seq size baseArray)
 
 
main = do
  contents <- dnaThree
  writeFrequencies 1 contents 
  writeFrequencies 2 contents 
  tests <- mapM stringToSeq seqStrings
  mapM_ (writeCount contents) tests

[edit] 6.1 Data.Map #2

The problem with version 1 was the foldr in updateCounts. Changing to foldl' solves that. But this replacement for updateCounts makes it faster.

Compiling with "ghc --make -O3 -optc-O3 -fglasgow-exts -funbox-strict-fields -fasm" on G4.

-- Tweak to Brian Sniffen's Data.Map version by Chris Kuklewicz
 
import Control.Monad.ST
import Data.STRef
 
updateCounts :: Map Seq Int -> Int -> Seq -> Map Seq Int
updateCounts base k seq@(Seq size dna) = runST (do
 
  let toRef mapST (key,value) = do ref <- newSTRef value
                                   return $ Map.insert key ref mapST
 
      fromRef (key,ref) = readSTRef ref >>= (\value -> return (key,value))
 
  baseST <- foldM toRef Map.empty (Map.toAscList base) 
 
  let countframe :: Map Seq (STRef s Int) -> Int -> 
              ST s (Map Seq (STRef s Int))
      countframe mapST offset = do
        let key = Seq k (advancePtr dna offset)
        case Map.lookup key mapST of
          Nothing  -> newSTRef 1 >>= (\ref -> return $ Map.insert key ref mapST)
          Just ref -> modifySTRef ref succ >> return mapST
 
  mapST <- foldM countframe baseST [0..size-k]
 
  mapM fromRef (Map.toAscList mapST) >>= return . (Map.fromAscList)
  )

[edit] 6.2 Data.Map #3 (ByteString)

I cobbled this together based on some other entries. It could be a little clearer and certainly a little faster, but is probably a good starting point. It runs faster on the file that I have than any of the other versions I benchmarked.

import Data.Char
import qualified Data.Map as Map
import Data.List
import Text.Printf(printf)
import qualified Data.ByteString.Char8 as B
import Data.Ord (comparing)
 
type FreqMap = Map.Map B.ByteString Int
 
loadMap :: Int -> [B.ByteString] -> FreqMap
loadMap i s = foldl' (\m w -> Map.insertWith (+) w 1 m) Map.empty snips
 where snips = filter (not . B.null ) $ map (B.take i) s
 
writeFrequencies i dna = 
  let mp = loadMap i dna
      total = fromIntegral (Map.fold (+) 0 mp ) / 100 :: Double
      res = map (\(a,b) -> (a, fromIntegral b / total)) (Map.toAscList mp)
      in mapM_ showFun . sortBy (flip (comparing snd)) $ res
 
showFun :: (B.ByteString, Double) -> IO ()
showFun (k,f) = printf "%s %.3f\n" (B.unpack k) f
 
writeCount dna sq = printf "%d\t%s\n" cnt (B.unpack sq)
      where cnt = length $ filter (B.isPrefixOf sq) dna
 
dnaThree :: IO [B.ByteString]
dnaThree = process =<< B.getContents
    where process     = return . B.tails . ul . takeNorm . tail . dropComment . dropOther . B.lines 
          dropOther   = dropWhile (\str -> not ((B.pack ">THREE") `B.isPrefixOf` str))
          dropComment = dropWhile ((';' ==) . B.head)
          takeNorm    = takeWhile (('>' /=) . B.head) 
          ul          = B.map toUpper . B.concat 
 
main = do three <- dnaThree
          writeFrequencies 1 three >> putStrLn ""
          writeFrequencies 2 three >> putStrLn ""
          mapM_ (writeCount three . B.pack) ["GGT", "GGTA", "GGTATT", "GGTATTTTAATT", "GGTATTTTAATTTATAGT"]

[edit] 7 KetilMalde : Trie-based entry

This entry uses a lazy suffix trie (loosely based on Kurtz/Giegerich: "A comparison of imperative and purely functional suffix tree constructions").

I guess this is considered cheating, since it is the easy and natural way to do it :-) While the specifications say "hashtable", I interpret it to mean "standard associative data structure" - so using Data.Map would be okay, and perhaps even the true Haskell way. The reason I still consider this entry cheating, is that it relies on lazyness to compute only the requested frequencies, instead of, as the benchmark specifies calculation of -- and subsequent discarding -- all frequencies of the given word lengths.

If I were a mean spirited person (and I am not, no, really), I would point this out as yet another benchmark artificially penalizing a language where it is easy and natural to avoid unnecessary computation. As it is, this can perhaps go in the "Interesting Alternatives" category (as Chris points out).

Note that most of the time is now spent parsing input, if somebody wanted to further improve it, using FPS or similar would be the way to go.

In my opinion, we can always exploit lazyness. If a spec requires us to compute something that isn't used, we should just let our language take care of this. The spec can always be changed if they want to explicitly penalise lazy languages (which I don't think they do -- they just don't think of the alternatives). So, lazyness is definitely ok. -- Don

-- By KetilMalde and ChrisKuklewicz
 
import System.IO
import Text.Printf(printf)
import Control.Monad(unless,zipWithM_)
import Data.List hiding (insert)
import Data.Maybe(maybe,fromMaybe)
import Data.Char(ord,chr,toUpper)
 
seqStrings = ["GGT","GGTA","GGTATT","GGTATTTTAATT","GGTATTTTAATTTATAGT"]
 
data Trie = T { as, cs, gs, ts, ns :: Trie,
                acount, ccount, gcount, tcount, ncount :: !Int }
          | Nil deriving (Eq,Show)
 
total (T _ _ _ _ _ ac cc gc tc nc) = ac+cc+gc+tc+nc
 
emptyT = T Nil Nil Nil Nil Nil 0 0 0 0 0
 
main = do all <- getContents
          let three = getSection ">THREE" all
              t = foldl' insert emptyT (tails three)
          showFrequencies 1 t
          showFrequencies 2 t
          zipWithM_ (\f s->putStrLn $ (show f)++('\t':s)) (map (lookupFrequency t) seqStrings) seqStrings
 
insert :: Trie -> String -> Trie
insert t [] = t
insert Nil (s:ss) = case s of 
    'A' -> emptyT {as = insert emptyT ss, acount = 1}
    'C' -> emptyT {cs = insert emptyT ss, ccount = 1}
    'G' -> emptyT {gs = insert emptyT ss, gcount = 1}
    'T' -> emptyT {ts = insert emptyT ss, tcount = 1}
    _   -> emptyT {ns = insert emptyT ss, ncount = 1}
insert t (s:ss) = case s of 
    'A' -> t { as = insert (as t) ss, acount = 1+(acount t) }
    'C' -> t { cs = insert (cs t) ss, ccount = 1+(ccount t) }
    'G' -> t { gs = insert (gs t) ss, gcount = 1+(gcount t) }
    'T' -> t { ts = insert (ts t) ss, tcount = 1+(tcount t) }
    _   -> t { ns = insert (ns t) ss, ncount = 1+(ncount t) }
 
showFrequencies k t = 
  let printBSF (bs,f) = printf "%s %.3f\n" bs f
      asPercent = map convert hist
          where tot = fromIntegral (total t) :: Double
                hist = writeFrequencies k t
                convert (s,f) = (s,100 * fromIntegral f / tot)
      mySort = sortBy freqAndKey
        where freqAndKey (k1,x) (k2,y) = compare (y,k1) (x,k2)
  in do mapM_ printBSF (mySort asPercent)
        putChar '\n'
 
writeFrequencies :: Int -> Trie -> [(String,Int)]
writeFrequencies _ Nil = []
writeFrequencies 1 (T _ _ _ _ _ ac cc gc tc nc) = zip (map (:[]) "ACGT") [ac,cc,gc,tc]
writeFrequencies k (T as cs gs ts _ ac cc gc tc _) = concatMap freq "ACGT"
  where k' = k-1
        mapc c = map (\(s,f)->(c:s,f))
        freq :: Char -> [(String,Int)]
        freq 'A' = if ac>0 then mapc ('A') (writeFrequencies k' as) else []
        freq 'C' = if cc>0 then mapc ('C') (writeFrequencies k' cs) else []
        freq 'G' = if gc>0 then mapc ('G') (writeFrequencies k' gs) else []
        freq 'T' = if tc>0 then mapc ('T') (writeFrequencies k' ts) else []
 
lookupFrequency :: Trie -> String -> Int
lookupFrequency _ [] = 0
lookupFrequency Nil _ = 0
lookupFrequency (T _ _ _ _ _ ac cc gc tc nc) (s:[]) = case s of
     'A' -> ac; 'C' -> cc; 'G' -> gc; 'T' -> tc; _   -> nc
lookupFrequency (T a c g t n _ _ _ _ _) (s:ss) = lookupFrequency next ss
  where next = case s of 'A' -> a; 'C' -> c; 'G' -> g; 'T' -> t; _   -> n
 
-- allocate, read, and return the main fasta data
getSection :: String -> String -> String
getSection prefix = map toUpper . concat . getP . lines
  where getP :: [String] -> [String]
        getP = takeWhile (not . isPrefixOf ">") . drop 1 . dropWhile (not . isPrefixOf prefix)

[edit] 7.1 (FASTEST) Generalized Trie with Assoc List #2

This improves on the amount of heap allocation. {{{insertAll}}} constructs the entries in the association lists exacly once, and the TValues are constructed exactly one. This is unlike the iterative construction in the previous code: Trie, Trie w/map, and Trie w/assoc list. -- ChrisKuklewicz

{-# OPTIONS_GHC -O2 -funbox-strict-fields #-}
-- By Chris Kuklewicz and Ketil Malde
-- Trie with efficiently contructed association lists
 
-- May or may not need +RTS -A100m -RTS (or -H100m, or higher values)
-- to run efficiently
import Text.Printf(printf)
import Data.List
import Data.Maybe(maybe)
import Data.Char(ord,chr,toUpper)
 
seqStrings = ["GGT","GGTA","GGTATT","GGTATTTTAATT","GGTATTTTAATTTATAGT"]
 
data TValue = TV {tc :: !Int, tm :: TMap} deriving Show  -- !Int vs Int is little different
type TMap = [(Char,TValue)]
 
main :: IO ()
main = do three <- getSection ">THREE" =<< getContents
          let t = insertAll (tails three)
          mapM_ (showFrequencies t) [1,2]
          mapM_ (\s->putStrLn $ (show $ lookupFrequency t s)++('\t':s)) seqStrings
 
-- The (,) tuple and TV constructors are only used once which helps perfomance.
-- The list of strings cannot contain a null string unless it is the last one.
insertAll :: [String] -> TMap
insertAll ((h:t):ss) = (h,TV {tm = insertAll gatherH, tc = length gatherH}) : insertAll withoutH
    where gatherH = t : map tail (filter isH ss)
          withoutH = filter notH ss
          isH (x:_) = h==x
          isH []    = False
          notH (x:_) = h/=x
          notH []    = False
insertAll _ = []
 
showFrequencies :: TMap -> Int -> IO ()
showFrequencies t k = let printBSF = uncurry $ printf "%s %.3f\n"
                          asPercent = map convert (writeFrequencies k t)
                              where convert (s,f) = (s,100 * fromIntegral f / tot)
                                    tot = fromIntegral ( sum $ map (tc.snd) t) :: Double
                          freqAndKey (k1,x) (k2,y) = compare (y,k1) (x,k2)
                      in do mapM_ printBSF (sortBy freqAndKey asPercent)
                            putChar '\n'
 
writeFrequencies :: Int -> TMap -> [(String,Int)]
writeFrequencies 1 t = map (\(c,tv) -> (c:[],tc tv)) (sortBy (\a b -> fst a `compare` fst b) t)
writeFrequencies k t = let mapc :: Char -> [(String,Int)] -> [(String,Int)]
                           mapc c = map (\(s,i) -> (c:s,i))
                           write :: (Char,TValue) -> [(String,Int)]
                           write (c,tv) = mapc c (writeFrequencies (k-1) (tm tv))
                       in concatMap write (sortBy (\a b -> fst a `compare` fst b) t)
 
lookupFrequency :: TMap -> String -> Int
lookupFrequency _ [] = 0
lookupFrequency t (s:[]) = maybe 0 tc $ lookup s t
lookupFrequency t (s:ss) = maybe 0 (\tv -> lookupFrequency (tm tv) ss) $ lookup s t
 
getSection :: String -> String -> IO String
getSection prefix = return . map toUpper . concat . getP . lines
  where getP :: [String] -> [String]
        getP = takeWhile ((/='>').head) . tail . dropWhile (not . isPrefixOf prefix)

[edit] 7.2 Compact form of Trie w/Assoc list #2

Edited for lines of code and clarity of variable/function names. Compile with -O2 -optc-O3 -funbox-strict-fields. Perhaps run with some of +RTS -H100m -K32m -RTS.

My favorite trick here is {{{compare `mappend` compare}}}.

-- By Chris Kuklewicz and Ketil Malde
-- Trie with efficiently contructed association lists
import Text.Printf(printf)
import Data.List(tails,sortBy,isPrefixOf)
import Data.Maybe(maybe)
import Data.Char(ord,chr,toUpper)
import Data.Monoid(mappend)
 
default(Int)
data TValue = TValue {count :: !Int, tmap :: TMap}
type TMap = [(Char,TValue)]
 
seqStrings = ["GGT","GGTA","GGTATT","GGTATTTTAATT","GGTATTTTAATTTATAGT"]
 
main = do section <- return . insertAll . tails . (getSection ">THREE") =<< getContents
          mapM_ (showFrequencies section) [1,2]
          mapM_ (\s->putStrLn $ (show $ lookupFrequency section s)++('\t':s)) seqStrings
 
insertAll ((h:t):ss) = (h,TValue {tmap = insertAll gatherH, count = length gatherH}) : insertAll (filter notH ss)
    where gatherH = t : map tail (filter isH ss)
          isH  [] = False ; isH  (x:_) = h==x
          notH [] = False ; notH (x:_) = h/=x
insertAll _ = []
 
getSection prefix = map toUpper . concat . justSection . lines
  where justSection = takeWhile (('>'/=).head) . tail . dropWhile (not . isPrefixOf prefix)
 
showFrequencies t k = let seqAndPercent = sortBy freqAndKey (map toPercent (kFrequencies k t))
                              where freqAndKey (k1,x) (k2,y) = compare y x `mappend` compare k1 k2
                                    toPercent (s,f) = (s,100 * fromIntegral f / total)
                                    total = (fromIntegral $ sum $ map (count.snd) t) :: Double
                      in do mapM_ (uncurry $ printf "%s %.3f\n") seqAndPercent
                            putChar '\n'
 
kFrequencies 1 t = map (\(c,tv) -> (c:[],count tv)) t
kFrequencies k t = let prepend c = map (\(s,i) -> (c:s,i))
                       next (c,tv) = prepend c (kFrequencies (k-1) (tmap tv))
                   in concatMap next t
 
lookupFrequency t (s:[]) = maybe 0 count $ lookup s t
lookupFrequency t (s:ss) = maybe 0 (\tv -> lookupFrequency (tmap tv) ss) $ lookup s t

[edit] 8 Previous current entry

This is an attempt to make the standard HashTable perform well enough. On my powerbook G4 it is slightly faster than the current shootout entry and slightly faster than Don's packed buffer below. It uses 43 MB of RSIZE. The winning c-code entry uses a hash table at the shootout site. Perhaps we should FFI the same hash table code.

Compile with -O3 -optc-O3 -fasm -fglasgow-exts, Run with +RTS -H40m -RTS

This no longer uses "Seq = Seq !Int !(Ptr Word8)" but just "Ptr Word8", since the frameSize is known. The code has been shortened and tightened quite a bit. It has been submitted.

{-# OPTIONS -fglasgow-exts -fbang-patterns -funbox-strict-fields #-}
--
-- The Computer Language Shootout
-- http://shootout.alioth.debian.org/
--
-- Contributed by Don Stewart
-- Uses a port of the simple hashtable from the Clean entry
--
 
import GHC.Exts
import GHC.IOBase
 
import Foreign
import Char
import List
import Maybe
import Text.Printf
 
import Data.ByteString.Internal
import qualified Data.ByteString.Char8 as S
 
import Data.Array.Base
import qualified Data.Array.IO as A
 
main = do
    (PS fp o l) <- get (S.pack ">TH")
    withForeignPtr fp $ \p -> do
        let sec = p `plusPtr` o
        mapM_ (writeFreqs l sec) [1,2]
        mapM_ (writeFrame l sec) =<< mapM toseq ["GGT","GGTA",
                                                 "GGTATT","GGTATTTTAATT",
                                                 "GGTATTTTAATTTATAGT"]
 
get p = do
    s <- S.getContents
    let Just n = S.findSubstring p s
    return $! S.map toUpper             -- array fusion!
            . S.filter    ((/=) '\n')
            . S.dropWhile ((/=) '\n')
            . S.copy
            . S.drop n $ s
 
writeFreqs size p n = do
    h   <- htNew n size
    htInsert size p n h
    let vs = htNodes h
    mapM_ draw (sortBy kf vs)
    putChar '\n'
  where
    draw (Node p f) = printf "%s %.3f\n" (ppr n p) pct
        where pct   = (100 * (fromIntegral f) / total) :: Double
              total = fromIntegral (size - n + 1)
 
    kf (Node k x) (Node j y) = compare (y, ppr n k) (x, ppr n j)
 
writeFrame size p (n,k) = do
    h <- htNew n size
    htInsert size p n h
    Node k v <- htFind k h
    putStrLn $ (show v) ++ ('\t' : ppr n k)
 
ppr n p = inlinePerformIO (map w2c `fmap` peekArray n p)
toseq s = fmap ((,) (length s)) (newArray0 0 (map c2w s))
 
------------------------------------------------------------------------
--
-- An implementation of simpl_hash.c in Haskell
--
 
data Hash    = HT !Int !Int !(A.IOArray Int Buckets)
 
data Buckets = Empty | Bucket !(Ptr Word8) !Int | Buckets [Node]
 
data Node    = Node !(Ptr Word8) !Int
 
htNew :: Int -> Int -> IO Hash
htNew !fl !sz = HT fl nprime `fmap` A.newArray (0,nprime-1) Empty
  where
    n      = htSize fl sz
    nprime = head (dropWhile (< n) primes)
 
htSize :: Int -> Int -> Int
htSize !fl !buflen = min lim (go (fl-1) 4)
  where
    lim = (buflen - fl) `div` 1024
    go !n !m | n > 0 && m < lim      = go (n-1) (m*4)
             | otherwise             = m
 
htInsert :: Int -> Ptr Word8 -> Int -> Hash -> IO ()
htInsert !s !p n !h = mapM_ (htInc h . plusPtr p) [0..s-n]
 
htInc :: Hash -> Ptr Word8 -> IO ()
htInc (HT n size arr) k  =
    case htHash size n k of
        !i -> do b <- unsafeRead arr i
                 unsafeWrite arr i $! inc b
  where
    equal = eq n
 
    inc :: Buckets -> Buckets
    inc (Bucket !k' !v)
        | k' `equal` k = Bucket  k' (v+1)
        | otherwise    = Buckets $ Node k' v : [Node k 1]
    inc (Buckets b)    = Buckets $ incL b
    inc Empty          = Bucket k 1
 
    incL :: [Node] -> [Node]
    incL (!i@(Node k' v):ls)
        | k' `equal` k = Node k' (v+1) : ls
        | otherwise    = i : incL ls
    incL []            = [Node k 1]
 
htNodes :: Hash -> [Node]
htNodes (HT _ size arr) = items 0
  where
    read i = inlinePerformIO $! unsafeRead arr i
 
    items !i | i >= size = []
             | otherwise = items_bucket (read i) (i+1)
 
    items_bucket !(Bucket !k' !v) i = Node k' v : items i
    items_bucket !(Buckets !b) i    = items_list b i
    items_bucket Empty        !i    = items i
 
    items_list (!e:l) !i = e : items_list l i
    items_list []     !i = items i
 
htFind :: Ptr Word8 -> Hash -> IO Node
htFind !k !(HT n size arr) = do
    let !i = htHash size n k
    v <- unsafeRead arr i
    return $! find v
  where
    equal = eq n
 
    find  (Bucket k' v) | k' `equal` k = Node k' v
                        | otherwise    = Node k  0
    find  (Buckets l)   = find' l
    find  Empty         = Node k 0
 
    find' (i@(Node !k' _):ls) | k' `equal` k = i
                              | otherwise    = find' ls
    find' []           = Node k 0
 
htHash :: Int -> Int -> Ptr Word8 -> Int
htHash (I# max) (I# size) (Ptr p) = abs . inlinePerformIO . IO $ go p 0#
  where
    lim = p `plusAddr#` size
    go p acc !s
        | p `geAddr#` lim = (# s, I# (acc `remInt#` max) #)
        | otherwise       = case readInt8OffAddr# p 0# s of
                (# s, i #) -> go (p `plusAddr#` 1#) (5# *# acc +# i) s
 
-- A fast Ptr comparison for Hash keys
eq :: Int -> Ptr Word8 -> Ptr Word8 -> Bool
eq !n p q = inlinePerformIO $ do
    a <- peek p :: IO Word8
    b <- peek q :: IO Word8
    if a /= b then return False
              else go n p q
  where
    go !n !p !q
        | n == 0    = return True
        | otherwise = do
            a <- peek p :: IO Word8
            b <- peek q :: IO Word8
            if a /= b then return False
                      else go (n-1) (p `plusPtr` 1) (q `plusPtr` 1)
 
primes = [ 53,       97,        193,       389,       769,
           1543,     3079,      6151,      12289,     24593,
           49157,    98317,     196613,    93241,     786433,
           1572869,  3145739,   6291469,   12582917,  25165843,
           50331653, 100663319, 201326611, 402653189, 805306457 ]

[edit] 9 Current entry

by Stephen Blackheath

This improves on Don Stewart's by speeding the hash table up a fair bit, adding parallelism, and it also complies better with the rules by implementing the required automatic growing of the hash table.

{-# LANGUAGE BangPatterns #-}
{-# OPTIONS_GHC -O2 #-}
 
-- The Computer Language Benchmarks Game
-- http://shootout.alioth.debian.org/
--
-- contributed by Stephen Blackheath (with some bits taken from Don Stewart's
--     version), v1.2
 
import Text.Printf
import Data.ByteString.Internal
import qualified Data.ByteString.Char8 as S
import Control.Applicative
import Control.Monad
import Control.Concurrent
import Foreign.Storable
import Foreign.Marshal.Alloc
import Foreign.Marshal.Array
import Foreign.Ptr
import Foreign.ForeignPtr
import Data.Word
import Data.Bits
import Data.Char
import Data.List
import Data.Maybe
import Data.IORef
import GHC.Exts
 
 
main = do
    genome <- extract (S.pack ">TH")
    let actions = [
                do
                    a <- printFreqsBySize genome 1
                    b <- printFreqsBySize genome 2
                    return $ a ++ b
            ] ++ map (printFreqsSpecific genome) specificSeqs
    output <- concat <$> parallel actions
    forM_ output putStrLn
 
-- Drop in replacement for sequence
parallel :: [IO a] -> IO [a]
parallel actions = do
    vars <- forM actions $ \action -> do
        var <- newEmptyMVar
        forkIO $ do
            answer <- action
            putMVar var answer
        return var
    forM vars takeMVar
 
specificSeqs = map S.pack [
    "GGT","GGTA","GGTATT","GGTATTTTAATT","GGTATTTTAATTTATAGT"]
 
extract p = do
    s <- S.getContents
    let (_, rem)  = S.breakSubstring p s
    return $! S.map toUpper             -- array fusion!
            . S.filter    ((/=) '\n')
            . S.dropWhile ((/=) '\n') $ rem
 
printFreqsBySize :: S.ByteString -> Int -> IO [String]
printFreqsBySize genome keySize = do
        ht0 <- htNew keySize
        ht <- hashGenome genome keySize ht0
        l <- htToList ht
        htFree ht
        return $ map draw (sortBy sortRule l) ++ [""]
    where
        genomeLen = S.length genome
        draw :: (S.ByteString, Int) -> String
        draw (key, count) = printf "%s %.3f" (S.unpack key) pct
            where pct   = (100 * (fromIntegral count) / total) :: Double
                  total = fromIntegral (genomeLen - keySize + 1)
 
printFreqsSpecific :: S.ByteString -> S.ByteString -> IO [String]
printFreqsSpecific genome seq = do
    let keySize = S.length seq
        genomeLen = S.length genome
    ht0 <- htNew keySize
    ht <- hashGenome genome keySize ht0
    let (fp, offset, len) = toForeignPtr seq
    count <- withForeignPtr fp $ \p_ -> do
        htGet ht (p_ `plusPtr` offset)
    htFree ht
    return [show count ++ ('\t' : S.unpack seq)]
 
hashGenome :: S.ByteString
           -> Int
           -> Hashtable
           -> IO Hashtable
{-# INLINE hashGenome #-}
hashGenome genome keySize ht = do
    let (fp, offset, len) = toForeignPtr genome
    withForeignPtr fp $ \p_ -> do
        let p = p_ `plusPtr` offset
            loop ht idx = do
                let key = p `plusPtr` idx
                htInc ht key
            endIdx = len - keySize + 1
            foldMe i ht | i == endIdx = return ht
            foldMe i ht = loop ht i >>= foldMe (i+1)
        foldMe 0 ht
 
sortRule :: (S.ByteString, Int) -> (S.ByteString, Int) -> Ordering
sortRule (a1, b1) (a2, b2) = compare (b2, a1) (b1, a2)
 
------ Hash table implementation ----------------------------------------------
 
-- Note: Hash tables are not generally used in functional languages, so there
-- are no available library implementations for Haskell.  This benchmark
-- requires a hash table.  This is why I have implemented the hash table here.
 
htNew :: Int -> IO Hashtable
htNew = htNew' (head primes)
 
htNew' :: Int -> Int -> IO Hashtable
htNew' slots ksz = do
    let ssz = spineSize ksz slots
    sp <- mallocBytes ssz
    memset sp 0 (fromIntegral ssz)
    return $ Hashtable {
            keySize   = ksz,
            noOfSlots = slots,
            spine     = sp
        }
 
primes = [ 1543,     3079,      6151,      12289,     24593,
           49157,    98317,     196613,    93241,     786433,
           1572869,  3145739,   6291469,   12582917,  25165843,
           50331653, 100663319, 201326611, 402653189, 805306457 ]
 
htFree :: Hashtable -> IO ()
htFree ht = do
    htTraverse ht $ \isSpine (Entry ePtr) -> when (not isSpine) $ free ePtr
    free (spine ht)
 
htGet :: Hashtable -> Ptr Word8 -> IO Int
htGet ht key = do
    hash <- calcHash ht key
    htPayload ht hash key >>= peek
 
htInc :: Hashtable -> Ptr Word8 -> IO Hashtable
{-# INLINE htInc #-}
htInc !ht !key = do
    hash <- calcHash ht key
    pPayload <- htPayload ht hash key
    value <- peek pPayload
    poke pPayload (value+1)
    if value == 0 && (hash .&. 0x7f) == 0
        then checkGrow ht
        else return ht
 
htPut_ :: Hashtable -> Ptr Word8 -> Int -> IO ()
{-# INLINE htPut_ #-}
htPut_ !ht !key !value = do
    hash <- calcHash ht key
    pPayload <- htPayload ht hash key
    poke pPayload value
 
checkGrow :: Hashtable -> IO Hashtable
checkGrow ht = do
        let pTotal = totalEntriesOf ht
            slots = noOfSlots ht
        total <- (0x200+) <$> peek pTotal
        poke pTotal total
        if total >= slots
            then do
                let newSlots = head $ dropWhile (<= slots*2) primes
                ht' <- htNew' newSlots (keySize ht)
                htTraverse ht $ \_ -> reinsert ht' -- re-insert all the elts
                htFree ht
                poke (totalEntriesOf ht') total -- copy the total entry count
                return ht'
            else return ht
    where
        reinsert :: Hashtable -> Entry -> IO ()
        reinsert ht entry = do
            value <- peek (payloadOf entry)
            htPut_ ht (keyOf entry) value
 
htToList :: Hashtable -> IO [(S.ByteString, Int)]
htToList ht =
    htMap (\entry -> do
        keyStr <- keyString ht (keyOf entry)
        payload <- peek (payloadOf entry)
        return (keyStr, payload)) ht
 
htMap :: (Entry -> IO a) -> Hashtable -> IO [a]
htMap f ht = mapM f =<< htEntries ht
 
keyString :: Hashtable -> Ptr Word8 -> IO S.ByteString
keyString ht key = S.pack . map w2c <$> peekArray (keySize ht) key
 
isEmptySlot :: Entry -> IO Bool
{-# INLINE isEmptySlot #-}
isEmptySlot entry = do
    ch <- peek $ keyOf entry
    return $ ch == 0
 
htEntries :: Hashtable -> IO [Entry]
htEntries ht = do
    es <- newIORef []
    htTraverse ht $ \_ entry -> modifyIORef es $ \l -> entry:l
    readIORef es
 
htTraverse :: Hashtable -> (Bool -> Entry -> IO ()) -> IO ()
htTraverse ht f = he 0
    where
        slots = noOfSlots ht
        he i | i == slots = return ()
        he i = do
            let entry = indexEntry ht i
            empty <- isEmptySlot entry
            if empty
                then he (i+1)
                else links True i entry
        links isSpine i entry = do
            next <- peek $ nextPtrOf entry
            f isSpine entry
            if next == nullPtr
                then he (i+1)
                else links False i (Entry next)
 
data Hashtable = Hashtable {
        keySize   :: Int,
        noOfSlots :: Int,
        spine     :: Ptr Word8
    }
 
wordSize :: Int
wordSize = max (sizeOf (nullPtr :: Ptr Word8)) (sizeOf (0 :: Int))
 
-- Round up to word size
roundUp :: Int -> Int
{-# INLINE roundUp #-}
roundUp !i = (i + wordSize - 1) .&. complement (wordSize - 1)
 
slotSize :: Int -> Int
{-# INLINE slotSize #-}
slotSize !ksz = roundUp ksz + wordSize * 2
 
spineSize :: Int -> Int -> Int
spineSize ksz slots = slots * slotSize ksz + wordSize
 
calcHash :: Hashtable -> Ptr Word8 -> IO Int
{-# INLINE calcHash #-}
calcHash !ht !key = (`mod` noOfSlots ht) <$> ch 0 0
    where
        ksz = keySize ht
        ch :: Int -> Int -> IO Int
        ch !i !acc | i == ksz = return acc
        ch !i !acc = do
            c <- peek (key `plusPtr` i)
            ch (i+1) (acc * 131 + fromIntegral (c::Word8))
 
newtype Entry = Entry (Ptr Word8)
 
-- Count of the total number of hash table entries
totalEntriesOf :: Hashtable -> Ptr Int
{-# INLINE totalEntriesOf #-}
totalEntriesOf ht = castPtr $ spine ht
 
indexEntry :: Hashtable -> Int -> Entry
{-# INLINE indexEntry #-}
indexEntry !ht !hash =
    let hOffset = wordSize + hash * slotSize (keySize ht)
    in  Entry $ spine ht `plusPtr` hOffset
 
entryMatches :: Hashtable -> Entry -> Ptr Word8 -> IO Bool
{-# INLINE entryMatches #-}
entryMatches !ht !entry !key = do
    let eKey = keyOf entry
    c <- memcmp key eKey (fromIntegral $ keySize ht)
    if c == 0
        then return True
        else do
            empty <- isEmptySlot entry
            if empty
                then do
                    memcpy eKey key (fromIntegral $ keySize ht)  -- ick
                    return True
                else
                    return False
 
nextPtrOf :: Entry -> Ptr (Ptr Word8)
{-# INLINE nextPtrOf #-}
nextPtrOf !(Entry ePtr) = castPtr $ ePtr
 
payloadOf :: Entry -> Ptr Int
{-# INLINE payloadOf #-}
payloadOf !(Entry ePtr) = castPtr $ ePtr `plusPtr` wordSize
 
keyOf :: Entry -> Ptr Word8
{-# INLINE keyOf #-}
keyOf !(Entry ePtr) = ePtr `plusPtr` (wordSize*2)
 
allocEntry :: Hashtable -> Ptr Word8 -> IO Entry
allocEntry !ht !key = do
    let esz = slotSize $ keySize ht
    ePtr <- mallocBytes esz
    memset ePtr 0 (fromIntegral esz)
    let entry = Entry ePtr
    memcpy (keyOf entry) key (fromIntegral $ keySize ht)
    return entry
 
htPayload :: Hashtable -> Int -> Ptr Word8 -> IO (Ptr Int)
{-# INLINE htPayload #-}
htPayload !ht !hash !key = do
        entry <- findEntry (indexEntry ht hash)
        return $ payloadOf entry
    where
        findEntry :: Entry -> IO Entry
        findEntry !entry = do
            match <- entryMatches ht entry key
            if match
                then
                    return entry
                else do
                    let pNext = nextPtrOf entry
                    next <- peek pNext
                    if next == nullPtr
                        then do
                            newEntry@(Entry ePtr) <- allocEntry ht key
                            poke pNext ePtr
                            return newEntry
                        else
                            findEntry (Entry next)

[edit] 10 Old code

[edit] 10.1 A packed buffer entry

20% slower than the current entry, uses around 1/3 of the space though. Using IntMap where possible is good for speed. Still, we should be able to do *much* better.

I wonder what this would be like using a Hashtable instead of a Map?

{-# OPTIONS -O2 -optc-O3 -fglasgow-exts -fasm -funbox-strict-fields #-}
--
-- Translation of the C version by Don Stewart
--
 
import GHC.Base
import GHC.Ptr
import GHC.IOBase
import GHC.Word
import qualified Data.Map    as M
import qualified Data.IntMap as I
import Data.Char
import Foreign
import System.IO
import qualified Control.Exception as C
import System.IO.Unsafe
import Control.Monad
import Data.List
import Text.Printf
 
main = do 
    line         <- mallocArray0 256        :: IO (Ptr Word8)
    buf0         <- mallocArray0 10240      :: IO (Ptr Word8)
    dropUntilThree line
    (buf,seqlen) <- takeSequence buf0 0 10240
    writeFreq 1 buf (seqlen-1)
    writeFreq 2 buf (seqlen-1)
    mapM_ (writeCount buf (seqlen-1)) seqs 
 
seqs= [(3,Ptr "GGT"#),(4,Ptr "GGTA"#),(6,Ptr "GGTATT"#)
      ,(12,Ptr "GGTATTTTAATT"#),(18,Ptr "GGTATTTTAATTTATAGT"#)]
 
-- Fill the buffer with sequence three
takeSequence buf off sz = do
    let p = buf `plusPtr` off :: Ptr Word8
    eof <- getLineW8 p
    i   <- lengthArray0 0 p 
    c   <- peek p >>= return . w2c
    let off' = off + i
    if eof || c == '>' 
           then do buf' <- reallocArray0 buf off'
                   return (buf', off')
           else if off' + 512 >= sz
                    then do let sz' = sz + 10240
                            buf' <- reallocArray0 buf sz'
                            takeSequence buf' off' sz'
                    else takeSequence buf  off' sz
 
dropUntilThree buf = do 
    getLineW8 buf
    x <- peek buf; y <- peek (buf `plusPtr` 1); z <- peek (buf `plusPtr` 2)
    case (w2c x, w2c y, w2c z) of 
        ('>','T','H') -> lengthArray0 0 buf
        _             -> dropUntilThree buf
 
 
w2c = chr . fromIntegral
 
-- read line by line into Word8 arrays
getLineW8 (buf :: Ptr Word8) = C.handle (const $ return True) $
    getLine >>= pokeArray0 0 buf . unsafeCoerce# . map toUpper >> return False
 
-- write code and percentage frequency for 1 and 2-nuc sequences, sorted by freq then key
writeFreq k (buf :: Ptr Word8) len = do
    m <- counts k buf len (I.empty :: I.IntMap Int) (I.insertWith (+)) pack
    let ls    = I.toList m
        total = fromIntegral . sum . map snd $ ls
        freqs = sortBy freqAndKey ls
    mapM_ (\(i,num) -> printf "%s %.3f\n" (unpack i k []) (percent num total)) freqs
    putChar '\n'
 
    where freqAndKey (k1,x) (k2,y) = case y `compare` x of EQ -> k1 `compare` k2 ; z -> z
          percent n t = 100 * (fromIntegral n) / t :: Float
 
writeCount (buf :: Ptr Word8) len (sl,p@(Ptr s)) = do
    c <- if len <= 4
        then do m <- counts sl buf len (I.empty :: I.IntMap Int) (I.insertWith (+)) pack
                k <- pack p (sl-1) 0
                return $ I.findWithDefault 0 k m
 
        else do m <- {-# SCC "countLongNucleotides" #-} counts sl buf len (M.empty :: M.Map P Int)
                        (M.insertWith (+)) (\p _ _ -> return $! P p sl)
                return $! M.findWithDefault 0 (P p sl) m
 
    putStr ((show c) ++ "\t")
    hPutBuf stdout p sl
    putChar '\n'
 
-- update a hash of k-nucleotide strings to their frequency
counts k (buf :: Ptr Word8) len zero update newkey = loop 0 zero
    where loop i m | i `seq` m `seq` False = undefined -- strictify yourself up
                   | i > len - k + 1 = return m
                   | otherwise = do key <- newkey (buf `plusPtr` i) (k-1) 0
                                    loop (i+1) (update key 1 m)
 
pack :: Ptr Word8 -> Int -> Int -> IO Int
pack p 0 n = do c <- peek p :: IO Word8 ; return $! n .|. (fromIntegral c)
pack p j n = do c <- peek (p `plusPtr` j) :: IO Word8
                pack p (j-1) $! n .|. (fromIntegral c `shiftL` (8*j))
 
unpack _ 0 s = s
unpack w j s = let c = chr $ (w `shiftR` (8 * (j-1))) .&. 0xff in unpack w (j-1) $! c:s
 
------------------------------------------------------------------------
--
-- Poor man's packed string
--
data P = P !(Ptr Word8) !Int
 
instance Eq  P where a == b = (comparePS a b) == EQ
instance Ord P where compare  = comparePS
 
comparePS (P _ 0)   (P _ 0) = EQ  -- short cut for empty strings
comparePS (P (Ptr p1) (I# l1)) (P (Ptr p2) (I# l2)) = 
    inlinePerformIO $ IO $ \s -> cmp p1 p2 0# l1 l2 s
 
cmp p1 p2 n len1 len2 s =
  if n ==# len1
        then if n ==# len2 then (# s, EQ #) else (# s, LT #)
        else if n ==# len2 then (# s, GT #)
              else case readWord8OffAddr# p1 n s of { (# s, a #) ->
                   case readWord8OffAddr# p2 n s of { (# s, b #) ->
                   case (W8# a) `compare` (W8# b) of 
                       EQ -> cmp p1 p2 (n +# 1#) len1 len2 s
                       LT -> (# s, LT #)
                       GT -> (# s, GT #) } } 
 
inlinePerformIO (IO m) = case m realWorld# of (# _, r #)   -> r
{-# INLINE inlinePerformIO #-}

[edit] 10.2 Original entry

This is the second slowest entry from any language. Space usage is 10x too high, maybe it leaks.

-- knucleotide.hs
--
-- The Great Computer Language Shootout
-- http://shootout.alioth.debian.org/
--
-- Contributed by Einar Karttunen
-- This is a purely functional solution to the problem.
-- An alternative which keeps a mutable table of occurences would be faster.
--
 
import Data.Char
import Data.List
import Numeric
 
counts :: Int -> String -> [String]
counts k dna = filter (not.null) $ map (taker k []) (tails dna)
    where taker 0 acc _      = reverse acc
          taker i acc (x:xs) = taker (i-1) (x:acc) xs
          taker i acc []     = []
 
writeFrequencies k dna =
  let cnt = counts k dna
      tot :: Float
      tot = fromIntegral $ length cnt
      frr = map (\ks -> (head ks, fromIntegral (length ks) * 100 / tot)) $ group $ sort cnt
      frq = sortBy (\(_,x) (_,y) -> y `compare` x) frr
      in mapM_ (\(k,f) -> putStr (k++" "++showFFloat (Just 3) f "\n")) frq >> putStrLn ""
 
writeCount sq dna = putStrLn (show cnt ++ "\t" ++ sq)
    where cnt = length $ filter (\c -> c==sq) $ counts (length sq) dna
 
dnaThree = process =<< getContents
    where process ls  = return $ ul $ takeNorm $ tail $ dropComment $ dropOther $ lines ls
          dropOther   = dropWhile (\str -> not (">THREE" `isPrefixOf` str))
          dropComment = dropWhile (\str -> head str == ';')
          takeNorm    = takeWhile (\str -> head str /= '>')
          ul str      = map toUpper $ concat str
 
main = do three <- dnaThree
          writeFrequencies 1 three
          writeFrequencies 2 three
          mapM_ (\k -> writeCount k three) ["GGT", "GGTA", "GGTATT", "GGTATTTTAATT", "GGTATTTTAATTTATAGT"]

[edit] 11 Bjorn Bringert

I think that the entries above are cheating a bit. In writeCount, they use filter to count the occurences of the given nucleotide, but do not count all the other ones. The benchmark description says: "count all the 3- 4- 6- 12- and 18-nucleotide sequences, and write the count and code for specific sequences". The description also says thet the nucleotide sequences shoule be "sorted by descending frequency and then ascending k-nucleotide key", but the entries above only sort by frequency.

This solution uses less memory (193 MB vs 268 MB on my machine), but takes more than twice as much time (74 s vs 32 s). I have tried using Data.HashTable, but the performace seems to be about same as when using Data.Map. Using FastPackedString reduces the memory use to 91 MB and time to 52 s, but FastpackedString is not available on the benchmark machine, I guess.

-- knucleotide.hs
--
-- The Great Computer Language Shootout
-- http://shootout.alioth.debian.org/
--
-- By Bjorn Bringert, based on solution by Einar Karttunen
 
import Data.Char
import Data.List
import Numeric
import qualified Data.Map as Map
 
prefixes :: Int -> String -> [String]
prefixes k dna = filter (not.null) $ map (taker k []) (tails dna)
    where taker 0 acc _      = reverse acc
          taker i acc (x:xs) = taker (i-1) (x:acc) xs
          taker i acc []     = []
 
counts :: Int -> String -> Map.Map String Int
counts k = Map.fromListWith (+) . map (flip (,) 1) . prefixes k
 
writeFrequencies k dna = mapM_ (\(k,c) -> putStrLn (k++" "++showfr c)) frq
  where tot = fromIntegral $ length dna + 1 - k :: Float
        frq = sortBy (\(_,x) (_,y) -> y `compare` x) $ Map.toList $ counts k dna
         -- Needed if counts list is not sorted by k-nucleotide
--        frq = sortBy (\(a,x) (b,y) -> compare y x `mappend` compare a b) $ counts ps
        showfr c = showFFloat (Just 3) (fromIntegral c * 100 / tot) ""
 
-- CHEAT, doesn't calcualte all frequencies
--writeCount dna sq = putStrLn (show cnt ++ "\t" ++ sq)
--    where cnt = length $ filter (==sq) $ prefixes (length sq) dna
 
writeCount dna sq = putStrLn (show cnt ++ "\t" ++ sq)
    where cnt = Map.findWithDefault 0 sq $ counts (length sq) dna
 
dnaThree = ul . takeNorm . tail . dropComment . dropOther . lines
  where dropOther   = dropWhile (not . (">THREE" `isPrefixOf`))
        dropComment = dropWhile ((==';') . head)
        takeNorm    = takeWhile ((/='>') . head)
        ul          = map toUpper . concat
 
main = do three <- fmap dnaThree getContents
          writeFrequencies 1 three >> putStrLn ""
          writeFrequencies 2 three >> putStrLn ""
          mapM_ (writeCount three) ["GGT", "GGTA", "GGTATT", "GGTATTTTAATT", "GGTATTTTAATTTATAGT"]