Serialisation and compression with Data Binary

From HaskellWiki
Jump to navigation Jump to search
The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and please use the default browser print function instead.

An example showing how to *efficiently* serialise data, compress it, and pass it to C, using Data.Binary and the zlib binding.

{-# OPTIONS -fglasgow-exts #-}

{-
An example showing how to:

    * Use the FFI
    * Compress streaming data
    * Serialise to and from disk
    * Stream lazy bytestrings efficiently

We will use
    * Foreign.* to generate the data
    * Wrap it as a lazy bytestring
    * Data.Binary to serialise it
    * Code.Compression.Gzip to compress/uncompress
    * Pass it to C and make a simple FFI call on the result
    * Display the result

Running:
    $ ghc -O2 A.hs --make

    $ time ./A                
    Built table
    Compressed      25600000 bytes
    Compressed size  2231545 bytes (91.28%)
    Decompressed    25600000 bytes
    Calling into C ...
    -8.742278e-8
    -0.6865875
    -0.7207948
    -0.1401903
    0.63918984
    0.7437966
    0.27236375
    -0.5763547
    -0.75708854
    -0.39026973
    ./A  2.98s user 0.11s system 94% cpu 3.275 total
-}

-- 
-- Some imports
--
import Foreign
import Foreign.C.Types
import Data.Int

import qualified Data.ByteString.Lazy     as L
import qualified Data.ByteString.Internal as S
import qualified Data.ByteString          as S

import Data.Binary
import Codec.Compression.GZip

import System.IO
import Text.Printf
import Control.Monad

------------------------------------------------------------------------
-- Foreign Ptrs
--
-- A simple wrapper type
--
data Table = Table { floats :: ForeignPtr CFloat
                   , ints   :: ForeignPtr Int    }

-- Statically fixed sizes
floatSize = 4800000
intSize   = 1600000

totalBytes = sizeOf (undefined :: CFloat) * floatSize
           + sizeOf (undefined :: Int)    * intSize

--
-- Build a table populated with some defaults
-- Float table filled with 'pi' , ints numbered consecutively
--
newTable :: IO Table
newTable = do
    fp <- S.mallocByteString (floatSize * sizeOf (undefined :: CFloat))
    ip <- S.mallocByteString (intSize   * sizeOf (undefined :: Int   ))
    withForeignPtr fp $ \p ->
        forM_ [0..floatSize-1] $ \n ->
            pokeElemOff p n pi
    withForeignPtr ip $ \p ->
        forM_ [0..intSize-1]   $ \n ->
            pokeElemOff p n n
    return (Table fp ip)

------------------------------------------------------------------------
-- Lazy ByteStrings
--
-- Convert ForeignPtr a to and from a lazy ByteString
--
toByteString   :: Storable a => ForeignPtr a -> Int -> L.ByteString
toByteString (fp :: ForeignPtr a) n =
    L.fromChunks . (:[]) $ S.fromForeignPtr (castForeignPtr fp) 0 
                                            (n * sizeOf (undefined :: a))

--
-- Flatten a lazy bytestring back to a ForeignPtr.
--
fromByteString :: Storable a => L.ByteString -> ForeignPtr a
fromByteString lbs = castForeignPtr fp
   where (fp,_,n) = S.toForeignPtr . S.concat $ L.toChunks lbs

------------------------------------------------------------------------
-- GZip and Data.Binary
--
-- Serialise a Table, compressing with gzip it as we go:
--
instance Binary Table where
    put (Table f i) = do
        put . compress . toByteString f $ floatSize
        put . compress . toByteString i $ intSize

    get = do
        fs <- liftM decompress get
        is <- liftM decompress get

        -- check we read the correct amount:
        if L.length fs + L.length is == fromIntegral totalBytes
            then return $ Table (fromByteString fs) (fromByteString is)
            else error "Partial read"

------------------------------------------------------------------------
-- FFI
--
-- Example call to process the data using C functions.
--
rounded :: Int -> ForeignPtr CFloat -> IO [CFloat]
rounded l fp = withForeignPtr fp $ \p -> go p
    where
        go p = forM [0..l-1] $ \n -> do
                    v <- peekElemOff p n
                    return $ c_tanhf (c_sinf (v + fromIntegral n))

-- A random C function to use:    
foreign import ccall unsafe "math.h sinf"  c_sinf  :: CFloat -> CFloat
foreign import ccall unsafe "math.h tanhf" c_tanhf :: CFloat -> CFloat


------------------------------------------------------------------------
--
-- Now glue it all together
-- 
main = do
    table <- newTable
    putStrLn "Built table"

    -- write the data to disk, compressed with gzip as we go.
    encodeFile "/tmp/table.gz" table
    printf "Compressed      %d bytes\n" totalBytes

    -- how good was the compression?
    h <- openFile "/tmp/table.gz" ReadMode
    n <- hFileSize h
    hClose h
    printf "Compressed size  %d bytes (%0.2f%%)\n" n
                (100 - (fromIntegral n/fromIntegral totalBytes*100) :: Double)

    -- load it back in, decompressing on the fly
    table' <- decodeFile "/tmp/table.gz"
    printf "Decompressed    %d bytes\n" totalBytes

    -- now process the floats with C
    printf "Calling into C ...\n"
    ps <- rounded 10 (floats table')
    forM_ ps print