Shootout/Fasta
From HaskellWiki
This is a Shootout Entry for [1].
The description of the program:
Each program should
- encode the expected cumulative probabilities for 2 alphabets
- generate DNA sequences, by weighted random selection from the alphabets (using the pseudo-random number generator from the random benchmark)
- generate DNA sequences, by copying from a given sequence
- write 3 sequences line-by-line in FASTA format
We'll use the generated FASTA file as input for other benchmarks (reverse-complement, k-nucleotide).
Contents |
1 Current entry
Same as previous, but with fewer strictness annotations thanks to strictify.
{-# OPTIONS -fvia-C -O2 -optc-O2 -optc-ffast-math -fbang-patterns -fexcess-precision #-} -- -- The Computer Language Benchmarks Game -- http://shootout.alioth.debian.org/ -- -- Contributed by Don Stewart -- A lazy bytestring solution. -- Unnecessary strictness annotations removed by Sterling Clover 2/08 -- -- Add: -- -optc-mfpmath=sse -optc-msse2 -- import System import Data.Word import Control.Arrow import qualified Data.ByteString.Lazy as L import qualified Data.ByteString.Lazy.Char8 as C (pack,unfoldr) import qualified Data.ByteString as S import Data.ByteString.Internal import Data.ByteString.Unsafe main = do n <- getArgs >>= readIO . head writeFasta "ONE" "Homo sapiens alu" (n*2) (L.cycle alu) g <- unfold "TWO" "IUB ambiguity codes" (n*3) (look iubs) 42 unfold "THREE" "Homo sapiens frequency" (n*5) (look homs) g ------------------------------------------------------------------------ -- -- lazily unfold the randomised dna sequences -- unfold l t n f g = putStrLn (">" ++ l ++ " " ++ t) >> unroll f g n unroll :: (Int -> (Word8, Int)) -> Int -> Int -> IO Int unroll f = loop where loop r 0 = return r loop !r i = case S.unfoldrN m (Just . f) r of (!s, Just r') -> do S.putStrLn s loop r' (i-m) where m = min i 60 look ds k = (choose ds d, j) where (d,j) = rand k choose :: [(Word8,Float)] -> Float -> Word8 choose [(b,_)] _ = b choose ((b,f):xs) p = if p < f then b else choose xs (p-f) ------------------------------------------------------------------------ -- -- only demand as much of the infinite sequence as we require writeFasta label title n s = do putStrLn $ ">" ++ label ++ " " ++ title let (t:ts) = L.toChunks s go ts t n where go ss s n | l60 && n60 = S.putStrLn l >> go ss r (n-60) | n60 = S.putStr s >> S.putStrLn a >> go (tail ss) b (n-60) | n <= ln = S.putStrLn (S.take n s) | otherwise = S.putStr s >> S.putStrLn (S.take (n-ln) (head ss)) where ln = S.length s l60 = ln >= 60 n60 = n >= 60 (l,r) = S.splitAt 60 s (a,b) = S.splitAt (60-ln) (head ss) ------------------------------------------------------------------------ im = 139968 ia = 3877 ic = 29573 rand :: Int -> (Float, Int) rand seed = (newran,newseed) where newseed = (seed * ia + ic) `rem` im newran = 1.0 * fromIntegral newseed / imd imd = fromIntegral im ------------------------------------------------------------------------ alu = C.pack "GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG\ \GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA\ \CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT\ \ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA\ \GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG\ \AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC\ \AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA" iubs = map (c2w *** id) [('a',0.27),('c',0.12),('g',0.12),('t',0.27),('B',0.02) ,('D',0.02),('H',0.02),('K',0.02),('M',0.02),('N',0.02) ,('R',0.02),('S',0.02),('V',0.02),('W',0.02),('Y',0.02)] homs = map (c2w *** id) [('a',0.3029549426680),('c',0.1979883004921) ,('g',0.1975473066391),('t',0.3015094502008)]
2 Previous entry
Pretty fast, uses some cute lazy bytestring tricks. Compile with -optc-mfpmath=sse -optc-msse2
{-# OPTIONS -O2 -optc-O2 -optc-ffast-math -fbang-patterns -fexcess-precision #-} -- -- The Computer Language Benchmarks Game -- http://shootout.alioth.debian.org/ -- -- Contributed by Don Stewart -- A lazy bytestring solution. -- -- Add: -- -optc-mfpmath=sse -optc-msse2 -- import System import Data.Word import Control.Arrow import qualified Data.ByteString.Lazy as L import qualified Data.ByteString.Lazy.Char8 as C (pack) import qualified Data.ByteString as S import Data.ByteString.Internal main = do n <- getArgs >>= readIO . head writeFasta "ONE" "Homo sapiens alu" (n*2) (L.cycle alu) g <- unfold "TWO" "IUB ambiguity codes" (n*3) (look iubs) 42 unfold "THREE" "Homo sapiens frequency" (n*5) (look homs) g ------------------------------------------------------------------------ -- -- lazily unfold the randomised dna sequences -- unfold l t n f !g = putStrLn (">" ++ l ++ " " ++ t) >> unroll f g n unroll :: (Int -> (Word8, Int)) -> Int -> Int -> IO Int unroll f = loop where loop r 0 = return r loop !r !i = case S.unfoldrN m (Just . f) r of (!s, Just r') -> do S.putStrLn s loop r' (i-m) where m = min i 60 look ds !k = let (d,j) = rand k in (choose ds d, j) choose :: [(Word8,Float)] -> Float -> Word8 choose [(b,_)] _ = b choose ((!b,!f):xs) !p = if p < f then b else choose xs (p-f) ------------------------------------------------------------------------ -- -- only demand as much of the infinite sequence as we require writeFasta label title n s = do putStrLn $ ">" ++ label ++ " " ++ title let (t:ts) = L.toChunks s go ts t n where go ss !t !n | l60 && n60 = S.putStrLn l >> go ss r (n-60) | n60 = S.putStr t >> S.putStrLn a >> go (tail ss) b (n-60) | n <= ln = S.putStrLn (S.take n t) | otherwise = S.putStr t >> S.putStrLn (S.take (n-ln) (head ss)) where !ln = S.length t !l60 = ln >= 60 !n60 = n >= 60 (l,r) = S.splitAt 60 t (a,b) = S.splitAt (60-ln) (head ss) ------------------------------------------------------------------------ im = 139968 ia = 3877 ic = 29573 rand :: Int -> (Float, Int) rand !seed = (newran,newseed) where !newseed = (seed * ia + ic) `rem` im !newran = 1.0 * fromIntegral newseed / imd imd = fromIntegral im ------------------------------------------------------------------------ alu = C.pack "GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG\ GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA\ CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT\ ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA\ GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG\ AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC\ AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA" iubs = map (c2w *** id) [('a',0.27),('c',0.12),('g',0.12),('t',0.27),('B',0.02) ,('D',0.02),('H',0.02),('K',0.02),('M',0.02),('N',0.02) ,('R',0.02),('S',0.02),('V',0.02),('W',0.02),('Y',0.02)] homs = map (c2w *** id) [('a',0.3029549426680),('c',0.1979883004921) ,('g',0.1975473066391),('t',0.3015094502008)]
3 A new proposal
Give it some testing if the clean cooseN or the hacked chooseN' works better. Starting with ghc-7 -fvia-C is deprecated, so -fllvm is included with (amd) optimized flags. You can also play around with inlining. The posted version worked best for me (amd64, ghc-7.0.4, llvm-2.7).
{--# OPTIONS -fvia-C -O3 -optc-O3 -fexcess-precision -optc-ffast-math -optc-march=native #-} {-# OPTIONS -fllvm -O3 -fexcess-precision -optlc-mcpu=amdfam10 -optlo-O3 -optlo-prune-eh -optlo-mem2reg -optlo-loop-unswitch -optlo-globalopt -optlc-O3 -optlc-enable-unsafe-fp-math -optlo-memdep -optlo-mem2reg -optlo-scalarrepl -optlo-loop-unroll -funbox-strict-fields #-} {-# LANGUAGE BangPatterns #-} import System.IO import System.Environment import Data.Word import Control.Monad import qualified Data.ByteString.Lazy as L import qualified Data.ByteString.Lazy.Char8 as C (pack) import qualified Data.ByteString as S import Data.ByteString.Internal import Foreign.ForeignPtr import Foreign.Ptr import Foreign.Storable updateUnfoldrN :: S.ByteString -> Int -> (a -> Maybe (Word8, a)) -> a -> IO (Maybe a) updateUnfoldrN bs i f x0 | i' <= 0 = do withForeignPtr fp (\ p -> memset p 0 (fromIntegral len)) return (Just x0) | otherwise = withForeignPtr fp $ \p -> do (n, res) <- go p x0 0 memset (p `plusPtr` n) 0 (fromIntegral (len - n)) return res where (fp, off, len) = toForeignPtr bs i' = min i len go !p !x !n | n < i' = case f x of Nothing -> return (n, Nothing) Just (w,x') -> poke p w >> go (p `plusPtr` 1) x' (n+1) | otherwise = return (n, Just x) {-# INLINE updateUnfoldrN #-} main = do n <- getArgs >>= readIO . head putStrLn ">ONE Homo sapiens alu" printBlocks takeN (L.cycle alu) (2*n) let bs = S.replicate 60 (c2w 'x') putStrLn ">TWO IUB ambiguity codes" seed <- printBlocks (chooseN' bs iubs) 42 (3*n) putStrLn ">THREE Homo sapiens frequency" printBlocks (chooseN' bs homs) seed (5*n) ------------------------------------------------------------------------ blockSizes :: Int -> [Int] blockSizes 0 = [] blockSizes n = let m = (min 60 n) in (m : (blockSizes (n-m))) printBlocks :: (a -> Int -> IO a) -> a -> Int -> IO a printBlocks f s n = foldM f s $ (blockSizes n) takeN :: L.ByteString -> Int -> IO L.ByteString takeN bs n = L.putStrLn l >> return r where (l, r) = L.splitAt (fromIntegral n) bs {--# INLINE takeN #-} chooseN :: [(Word8, Double)] -> Int -> Int -> IO Int chooseN ds seed n = S.putStrLn bs >> return seed' where (bs, Just seed') = S.unfoldrN n (chooseNext ds) seed {--# INLINE chooseN #-} chooseN' :: ByteString -> [(Word8, Double)] -> Int -> Int -> IO Int chooseN' bs ds seed n = do Just seed' <- updateUnfoldrN bs n (chooseNext ds) seed S.putStrLn bs return seed' {--# INLINE chooseN' #-} chooseNext :: [(Word8, Double)] -> Int -> Maybe (Word8, Int) chooseNext ds seed = Just (choose ds newran, newseed) where newseed = (seed * ia + ic) `rem` im newran = fromIntegral newseed / imd {--# INLINE chooseNext #-} choose :: [(Word8, Double)] -> Double -> Word8 choose [(b,_)] _ = b choose ((b,f):xs) p = if p < f then b else choose xs (p - f) ------------------------------------------------------------------------ im = 139968 imd = 139968.0 ia = 3877 ic = 29573 (***) :: (a -> c) -> (b -> d) -> (a, b) -> (c, d) (***) f g (x, y) = (f x, g y) {--# INLINE (***) #-} ------------------------------------------------------------------------ alu = C.pack "GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG\ \GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA\ \CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT\ \ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA\ \GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG\ \AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC\ \AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA" iubs = map (c2w *** id) [('a',0.27),('c',0.12),('g',0.12),('t',0.27),('B',0.02) ,('D',0.02),('H',0.02),('K',0.02),('M',0.02),('N',0.02) ,('R',0.02),('S',0.02),('V',0.02),('W',0.02),('Y',0.02)] homs = map (c2w *** id) [('a',0.3029549426680),('c',0.1979883004921) ,('g',0.1975473066391),('t',0.3015094502008)]
4 Old entry
This entry is similar to the "Old entry" on this page, but has been updated to use Data.ByteString. It appears to be respectably fast, execution time is about 3.5 times the C++ version in initial tests.
It has been submitted.
{- The Computer Language Shootout http://shootout.alioth.debian.org/ contributed by Jeff Newbern updated by Spencer Janssen and Don Stewart -} import System import qualified Data.ByteString.Char8 as B randomSequence :: Int -> [(Char,Double)] -> Int -> (B.ByteString, Int) randomSequence n bf seed = (sequence, seed') where (sequence, Just seed') = B.unfoldrN n f seed f s = Just (chooseBase bf (normalize s), nextSeed s) chooseBase :: [(Char,Double)] -> Double -> Char chooseBase [(b,_)] _ = b chooseBase ((b,f):xs) p | p < f = b | otherwise = chooseBase xs (p-f) writeFasta label title sequence = do putStrLn $ ">" ++ label ++ " " ++ title mapM_ B.putStrLn $ split 60 sequence split n xs | B.null xs = [] | otherwise = l : split n r where (l, r) = B.splitAt n xs main = do n <- getArgs >>= readIO . head let aluLen = 1 + 2 * n `div` B.length alu aluSeq = B.take (2 * n) . B.concat . replicate aluLen $ alu (iubSeq, seed) = randomSequence (3 * n) iub initSeed (homSeq, _) = randomSequence (5 * n) homosapiens seed writeFasta "ONE" "Homo sapiens alu" aluSeq writeFasta "TWO" "IUB ambiguity codes" iubSeq writeFasta "THREE" "Homo sapiens frequency" homSeq alu = B.pack "GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG\ \GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA\ \CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT\ \ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA\ \GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG\ \AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC\ \AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA" iub = [ ('a', 0.27), ('c', 0.12), ('g', 0.12), ('t', 0.27), ('B', 0.02) , ('D', 0.02), ('H', 0.02), ('K', 0.02), ('M', 0.02), ('N', 0.02) , ('R', 0.02), ('S', 0.02), ('V', 0.02), ('W', 0.02), ('Y', 0.02) ] homosapiens = [ ('a', 0.3029549426680), ('c', 0.1979883004921), ('g', 0.1975473066391), ('t', 0.3015094502008) ] im = 139968 ia = 3877 ic = 29573 nextSeed s = (s * ia + ic) `rem` im normalize n = (fromIntegral n) * (1.0 / fromIntegral im) initSeed = nextSeed 42
ArthurVanLeeuwen 13:38, 26 January 2007 (UTC) Changing chooseBase to not do the (p-f), adding
cumulative bfs = zip bases (scanl1 (+) frequencies) where (bases,frequencies) = unzip bfs
and passing (cumulative iub) rather than iub (etc.) may give another 10 percent speedup.
5 Current best entry (Haskell GHC #3)
This entry has been disqualified. From the Shootout judges: "Preconverting to Int is a really interesting approach - we're looking for the vanilla approach."
It now has respectable performance and memory usage.
{-# OPTIONS_GHC -O2 -funbox-strict-fields #-} -- The Great Computer Language Shootout -- http://shootout.alioth.debian.org/ -- -- contributed by Jeff Newbern -- Modified to fastest.hs by Chris Kuklewicz, 6 Jan 2006 -- Modified to fixed-fasta.hs by Chris Kuklewicz, 17 Jan 2006 -- -- Uses random generation code derived from Simon Marlow and Einar -- Karttunen's "random" test entry. No longer uses Double during run, -- everything has been pre-converted to Int. And pre-converted to a -- binary tree for lookup. Ideally this tree could be constructed -- with the probabilities in mind, but it isn't in this version. -- -- Compile with ghc --make resub-fasta.hs -o resub-fasta.ghc_run -- Run with "./rsub-fasta.ghc_run %A" where %A is the parameter import Control.Monad import Data.Char(chr,ord) import Data.List(mapAccumL) import Data.Word(Word8) import Data.IORef import Foreign import System(getArgs) import System.IO type Base = Word8 data BaseFrequencyTree = Node !Base | TreeNodes !Int !Base !Base | Tree !Int !BaseFrequencyTree !BaseFrequencyTree data Seed = Seed !Int b2c :: Word8 -> Char b2c = chr . fromEnum c2b = toEnum . ord alu = "GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG" ++ "GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA" ++ "CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT" ++ "ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA" ++ "GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG" ++ "AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC" ++ "AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA" im = 139968 :: Double iub = mkTree $ snd . mapAccumL (\rt (c,f) -> (f+rt,(c2b c,ceiling $ im*(f+rt)))) 0.0 $ [ ('a', 0.27), ('c', 0.12), ('g', 0.12), ('t', 0.27), ('B', 0.02), ('D', 0.02), ('H', 0.02), ('K', 0.02), ('M', 0.02), ('N', 0.02), ('R', 0.02), ('S', 0.02), ('V', 0.02), ('W', 0.02), ('Y', 0.02) ] homosapiens = mkTree $ snd . mapAccumL (\rt (c,f) -> (f+rt,(c2b c,ceiling $ im*(f+rt)))) 0.0 $ [ ('a', 0.3029549426680), ('c', 0.1979883004921), ('g', 0.1975473066391), ('t', 0.3015094502008) ] mkTree [(b,_)] = Node b mkTree [(b,f),(b',_)] = TreeNodes f b b' mkTree xs = let (h,t) = splitAt (length xs `div` 2) xs (_,f) = last h in Tree f (mkTree h) (mkTree t) chooseBase (Node b) _ = b chooseBase (TreeNodes f b b') p = if (p<f) then b else b' chooseBase (Tree f l r) p | p < f = chooseBase l p | otherwise = chooseBase r p writeFastaHeader label title = (putStrLn (('>':label) ++ (' ':title))) perLine = 60 writeAluBuffer total = do let l = length alu bufSize = l + perLine - 1 aluBuf <- mallocArray bufSize foldM_ (\ptr c -> poke ptr (c2b c) >> return (advancePtr ptr 1)) aluBuf (take bufSize (cycle alu)) let (full,end) = total `divMod` perLine fullLine n = let ptr = advancePtr aluBuf ((n * perLine) `mod` l) in hPutBuf stdout ptr perLine >> putChar '\n' lastLine = let ptr = advancePtr aluBuf ((full*perLine) `mod` l) in hPutBuf stdout ptr end >> putChar '\n' mapM_ fullLine [0..pred full] when (end>0) lastLine writeWrapped total trans initSeed = do seedRef <- newIORef initSeed let l = succ perLine (im,ia,ic)=(139968,3877,29573) nextSeed (Seed s) = Seed ( (s * ia + ic) `mod` im ) prng = do newSeed <- return.nextSeed =<< readIORef seedRef writeIORef seedRef newSeed return newSeed buf <- mallocArray l poke (advancePtr buf perLine) (c2b '\n') let (full,end) = total `divMod` perLine fill 0 _ = return () fill i ptr = do (Seed b) <- prng poke ptr (trans b) fill (pred i) (advancePtr ptr 1) fullLine = do fill perLine buf hPutBuf stdout buf l lastLine = do fill end buf poke (advancePtr buf end) (c2b '\n') hPutBuf stdout buf (succ end) replicateM_ full fullLine when (end>0) lastLine readIORef seedRef main = do args <- getArgs let n = if null args then 2500000 else read (head args) writeFastaHeader "ONE" "Homo sapiens alu" writeAluBuffer (2*n) writeFastaHeader "TWO" "IUB ambiguity codes" seed' <- writeWrapped (3*n) (chooseBase iub) (Seed 42) writeFastaHeader "THREE" "Homo sapiens frequency" writeWrapped (5*n) (chooseBase homosapiens) seed'
An extremely minor gripe, how about using:
alu = "GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG\ \GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA\ \CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT\ \ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA\ \GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG\ \AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC\ \AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA"
Rather than all those ++... This won't help performance one bit I would wager, but uses the "proper" way of doing multiline strings.--SebastianSylvan 11:40, 10 November 2006 (UTC)
ArthurVanLeeuwen 14:48, 26 January 2007 (UTC) I noticed that changing chooseBase in this program so that it conforms to the arbitrary rules set by the shootout maintainers makes this program only perform about 10% faster than the ByteString version above.
6 Non-leaking Entry
This a new entry, which is mostly the old entry below modified to use StateT to allow the random number stream to be finite, so no splitAt or drop functions are needed. This fixes the space leak and improves the time by almost a factor of 2. Serious speed will require changing from String to a Word8 array of some sort.
{- The Great Computer Language Shootout http://shootout.alioth.debian.org/ contributed by Jeff Newbern Modified by Chris Kuklewicz, 3 Jan 2006 Uses random generation code derived from Simon Marlow and Einar Karttunen's "random" test entry. Modified version note: Use a StateT around IO to manage the seed. This makes interleaving the generation and output of the sequences easier. -} import Control.Monad (replicateM) import Control.Monad.Trans import Control.Monad.State import System(getArgs) type Base = Char type Sequence = [Base] alu :: Sequence -- predefined sequence alu = "GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG" ++ "GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA" ++ "CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT" ++ "ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA" ++ "GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG" ++ "AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC" ++ "AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA" type BaseFrequency = (Base,Double) iub :: [BaseFrequency] iub = [ ('a', 0.27), ('c', 0.12), ('g', 0.12), ('t', 0.27), ('B', 0.02), ('D', 0.02), ('H', 0.02), ('K', 0.02), ('M', 0.02), ('N', 0.02), ('R', 0.02), ('S', 0.02), ('V', 0.02), ('W', 0.02), ('Y', 0.02) ] homosapiens :: [BaseFrequency] homosapiens = [ ('a', 0.3029549426680), ('c', 0.1979883004921), ('g', 0.1975473066391), ('t', 0.3015094502008) ] -- select a base whose interval covers the given double chooseBase :: [BaseFrequency] -> Double -> Base chooseBase [(b,_)] _ = b chooseBase ((b,f):xs) p | p < f = b | otherwise = chooseBase xs (p-f) type Seed = Int type Pseudo a = StateT Seed IO a prng :: Pseudo Double prng = let nextSeed s = (s * ia + ic) `mod` im normalize n = (fromIntegral n) * (1.0 / fromIntegral im) im = 139968; ia = 3877; ic = 29573 in do seed <- get let seed' = nextSeed seed put seed' return (normalize seed') prngN count = replicateM count prng -- write a sequence in Fasta format writeFasta :: String -> String -> Sequence -> IO () writeFasta label title sequence = do putStrLn $ ">" ++ (label ++ (" " ++ title)) writeWrapped 60 sequence where writeWrapped _ [] = return () writeWrapped len str = do let (s1,s2) = splitAt len str putStrLn s1 writeWrapped len s2 writeFastaHeader :: (MonadIO m) => String -> String -> m () writeFastaHeader label title = liftIO $ putStrLn $ ">" ++ (label ++ (" " ++ title)) writeWrapped' :: Int -> Int -> (Double->Base) -> Pseudo () writeWrapped' wrap total trans = work total where work 0 = return () work n = do let c' = min wrap n nextC = c - c' s <- liftM (map trans) (prngN c') liftIO $ putStrLn s work nextC writeWrapped = writeWrapped' 60 main = do args <- getArgs let n = if (null args) then 1000 else read (head args) writeFasta "ONE" "Homo sapiens alu" (take (2*n) (cycle alu)) writeFastaHeader "TWO" "IUB ambiguity codes" seed' <- execStateT (writeWrapped (3*n) (chooseBase iub)) 42 writeFastaHeader "THREE" "Homo sapiens frequency" execStateT (writeWrapped (5*n) (chooseBase homosapiens)) seed'
7 Old Entry
This is the old entry. It has a huge memory leak due to laziness.
-- The Great Computer Language Shootout -- http://shootout.alioth.debian.org/ -- contributed by Jeff Newbern -- Uses random generation code derived from Simon Marlow and Einar Karttunen's -- "random" test entry. -- Note: This code has not been optimized *at all*. It is written to be clear -- and to follow standard Haskell idioms as much as possible (but we have to match -- the stateful PRNG idiom in the test definition, which is oriented toward an -- imperative language). Performance is decent with ghc -O2, but if it can be -- improved without sacrificing the clarity of the code, by all means go for it! import System(getArgs) type Base = Char type Sequence = [Base] alu :: Sequence -- predefined sequence alu = "GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG" ++ "GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA" ++ "CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT" ++ "ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA" ++ "GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG" ++ "AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC" ++ "AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA" type BaseFrequency = (Base,Double) iub :: [BaseFrequency] iub = [ ('a', 0.27), ('c', 0.12), ('g', 0.12), ('t', 0.27), ('B', 0.02), ('D', 0.02), ('H', 0.02), ('K', 0.02), ('M', 0.02), ('N', 0.02), ('R', 0.02), ('S', 0.02), ('V', 0.02), ('W', 0.02), ('Y', 0.02) ] homosapiens :: [BaseFrequency] homosapiens = [ ('a', 0.3029549426680), ('c', 0.1979883004921), ('g', 0.1975473066391), ('t', 0.3015094502008) ] -- select a base whose interval covers the given double chooseBase :: [BaseFrequency] -> Double -> Base chooseBase [(b,_)] _ = b chooseBase ((b,f):xs) p | p < f = b | otherwise = chooseBase xs (p-f) -- write a sequence in Fasta format writeFasta :: String -> String -> Sequence -> IO () writeFasta label title sequence = do putStrLn $ ">" ++ (label ++ (" " ++ title)) writeWrapped 60 sequence where writeWrapped _ [] = do return () writeWrapped len str = do let (s1,s2) = splitAt len str putStrLn s1 writeWrapped len s2 -- generate an infinite sequence of random doubles using the -- prng from the "random" test probs :: Int -> [Double] probs seed = tail $ map normalize (iterate nextSeed seed) where nextSeed s = (s * ia + ic) `mod` im im = 139968 ia = 3877 ic = 29573 normalize n = (fromIntegral n) * (1.0 / fromIntegral im) main = do args <- getArgs let n = if (null args) then 1000 else read (head args) writeFasta "ONE" "Homo sapiens alu" (take (2*n) (cycle alu)) let (seq1,seq2) = splitAt (3*n) (probs 42) -- we have to match the imperative version let seq2' = take (5*n) seq2 -- instead of using the Haskell idiom writeFasta "TWO" "IUB ambiguity codes" (map (chooseBase iub) seq1) writeFasta "THREE" "Homo sapiens frequency" (map (chooseBase homosapiens) seq2')
