# Toy compression implementations

### From HaskellWiki

(Difference between revisions)

m |
(use laziness and HOFs to dramatically shrink lzw funcs) |
||

(3 intermediate revisions by 2 users not shown) | |||

Line 1: | Line 1: | ||

[[Category:Code]] |
[[Category:Code]] |
||

+ | |||

+ | == About == |
||

This code is provided in the hope that someone might find it interesting/entertaining, and to demonstrate what an excellent programming language Haskell truly is. (A working polymorphic LZW implementation in 10 lines? Try ''that'' in Java!) |
This code is provided in the hope that someone might find it interesting/entertaining, and to demonstrate what an excellent programming language Haskell truly is. (A working polymorphic LZW implementation in 10 lines? Try ''that'' in Java!) |
||

Line 6: | Line 8: | ||

[[User:MathematicalOrchid|MathematicalOrchid]] 16:46, 15 February 2007 (UTC) |
[[User:MathematicalOrchid|MathematicalOrchid]] 16:46, 15 February 2007 (UTC) |
||

+ | |||

+ | == Main module == |
||

<haskell> |
<haskell> |
||

module Compression where |
module Compression where |
||

− | import Data.List |
+ | import List |

+ | import Maybe |
||

+ | import IO (hFlush, stdout) |
||

+ | chars = [' '..'~'] -- Becuase ' ' = 0x20 and '~' = 0x7F. |
||

-- Run-length encoding |
-- Run-length encoding |
||

− | encode_RLE :: (Eq x) => [x] -> [(Int,x)] |
+ | encode_RLE :: (Eq t) => [t] -> [(Int,t)] |

encode_RLE = map (\xs -> (length xs, head xs)) . groupBy (==) |
encode_RLE = map (\xs -> (length xs, head xs)) . groupBy (==) |
||

− | decode_RLE :: [(Int,x)] -> [x] |
+ | decode_RLE :: [(Int,t)] -> [t] |

decode_RLE = concatMap (uncurry replicate) |
decode_RLE = concatMap (uncurry replicate) |
||

− | -- Limpel-Ziv-Welsh compression (Recommend using [Word8] or [SmallAlpha] for input!) |
+ | -- Limpel-Ziv-Welch encoding |

− | encode_LZW :: (Eq x, Enum x, Bounded x) => [x] -> [Int] |
+ | encode_LZW :: (Eq t) => [t] -> [t] -> [Int] |

− | encode_LZW [] = [] |
+ | encode_LZW alphabet = work (map (:[]) alphabet) where |

− | encode_LZW (x:xs) = work init [x] xs where |
+ | chunk pred lst = last . takeWhile (pred . fst) . tail $ zip (inits lst) (tails lst) |

− | init = map (\x -> [x]) $ enumFromTo minBound maxBound |
+ | work table [] = [] |

− | work table buffer [] = [maybe undefined id (elemIndex buffer table)] |
+ | work table lst = fromJust (elemIndex tok table) : work (table ++ [tok ++ [head rst]]) rst |

− | work table buffer (x:xs) = |
+ | where (tok, rst) = chunk (`elem` table) lst |

− | let new = buffer ++ [x] |
||

− | in case elemIndex new table of |
||

− | Nothing -> maybe undefined id (elemIndex buffer table) : work (table ++ [new]) [x] xs |
||

− | Just _ -> work table new xs |
||

− | -- TODO: Matching decode_LZW function. |
+ | decode_LZW :: [t] -> [Int] -> [t] |

+ | decode_LZW alphabet xs = concat output where |
||

+ | output = map (table !!) xs |
||

+ | table = map (:[]) alphabet ++ zipWith (++) output (map (take 1) (tail output)) |
||

− | -- TODO: Huffman encoding. |
+ | main = do x <- take 20000 `fmap` readFile "/usr/share/dict/words" |

+ | let l = length x `div` 80 |
||

+ | a = ['\0' .. '\255'] |
||

+ | eq a b | a == b = putChar '=' >> hFlush stdout |
||

+ | | otherwise = error "data error" |
||

+ | cmp = zipWith eq x . decode_LZW a . encode_LZW a $ x |
||

+ | vl = map head $ unfoldr (\cm -> case cm of [] -> Nothing ; _ -> Just (splitAt l cm)) cmp |
||

+ | sequence_ vl |
||

− | -- TODO: Arithmetic coding. |
||

</haskell> |
</haskell> |
||

− | It may also be useful to add the following for test purposes: |
+ | Some examples are in order: |

<haskell> |
<haskell> |
||

− | import Data.Word |
+ | > encode_RLE "AAAABBBBDDCCCCEEEGGFFFF" |

− | data SmallAlpha = AA | BB | CC | DD deriving (Show, Eq, Ord, Enum, Bounded) |
+ | [(4,'A'),(4,'B'),(2,'D'),(4,'C'),(3,'E'),(2,'G'),(4,'F')] |

− | parse1 'a' = AA |
||

− | parse1 'b' = BB |
||

− | parse1 'c' = CC |
||

− | parse1 _ = DD -- For safety |
||

− | parse = map parse1 |
+ | > decode_RLE [(4,'A'),(4,'B'),(2,'D'),(4,'C'),(3,'E'),(2,'G'),(4,'F')] |

+ | |||

+ | "AAAABBBBDDCCCCEEEGGFFFF" |
||

+ | |||

+ | |||

+ | > encode_LZW chars "This is just a simple test." |
||

+ | |||

+ | [52,72,73,83,0,97,0,74,85,83,84,0,65,0,83,73,77,80,76,69,0,84,69,104,14] |
||

+ | |||

+ | |||

+ | > decode_LZW chars [52,72,73,83,0,97,0,74,85,83,84,0,65,0,83,73,77,80,76,69,0,84,69,104,14] |
||

+ | |||

+ | "This is just a simple test." |
||

</haskell> |
</haskell> |
||

− | Anybody know how to use <hask>newtype</hask> to make a type like <hask>Char</hask> but with <hask>minBound</hask> and <hask>maxBound</hask> much closer together? |
+ | == Huffman coding == |

+ | |||

+ | <haskell> |
||

+ | module Huffman |
||

+ | (count, markov1, Tree, encode_huffman, decode_huffman) |
||

+ | where |
||

+ | |||

+ | import Data.List (nub) |
||

+ | |||

+ | -- Marvok1 probability model... |
||

+ | |||

+ | count :: (Eq t) => [t] -> [(t,Int)] |
||

+ | count xs = map (\x -> (x, length $ filter (x ==) xs)) $ nub xs |
||

+ | |||

+ | markov1 :: (Eq t) => [t] -> [(t,Double)] |
||

+ | markov1 xs = |
||

+ | let n = fromIntegral $ length xs |
||

+ | in map (\(x,c) -> (x, fromIntegral c / n)) $ count xs |
||

+ | |||

+ | |||

+ | -- Build a Huffman tree... |
||

+ | |||

+ | data Tree t = Leaf Double t | Branch Double (Tree t) (Tree t) deriving Show |
||

+ | |||

+ | prob :: Tree t -> Double |
||

+ | prob (Leaf p _) = p |
||

+ | prob (Branch p _ _) = p |
||

+ | |||

+ | get_tree :: [Tree t] -> (Tree t, [Tree t]) |
||

+ | get_tree (t:ts) = work t [] ts where |
||

+ | work x xs [] = (x,xs) |
||

+ | work x xs (y:ys) |
||

+ | | prob y < prob x = work y (x:xs) ys |
||

+ | | otherwise = work x (y:xs) ys |
||

+ | |||

+ | huffman_build :: [(t,Double)] -> Tree t |
||

+ | huffman_build = build . map (\(t,p) -> Leaf p t) where |
||

+ | build [t] = t |
||

+ | build ts = |
||

+ | let (t0,ts0) = get_tree ts |
||

+ | (t1,ts1) = get_tree ts0 |
||

+ | in build $ Branch (prob t0 + prob t1) t0 t1 : ts1 |
||

+ | |||

+ | |||

+ | -- Make codebook... |
||

+ | |||

+ | data Bit = Zero | One deriving (Eq, Show) |
||

+ | type Bits = [Bit] |
||

+ | |||

+ | huffman_codebook :: Tree t -> [(t,Bits)] |
||

+ | huffman_codebook = work [] where |
||

+ | work bs (Leaf _ x) = [(x,bs)] |
||

+ | work bs (Branch _ t0 t1) = work (bs ++ [Zero]) t0 ++ work (bs ++ [One]) t1 |
||

+ | |||

+ | |||

+ | -- Do the coding! |
||

+ | |||

+ | encode :: (Eq t) => [(t,Bits)] -> [t] -> Bits |
||

+ | encode cb = concatMap (\x -> maybe undefined id $ lookup x cb) |
||

+ | |||

+ | decode :: (Eq t) => Tree t -> Bits -> [t] |
||

+ | decode t = work t t where |
||

+ | work _ (Leaf _ x) [] = [x] |
||

+ | work t (Leaf _ x) bs = x : work t t bs |
||

+ | work t (Branch _ t0 t1) (b:bs) |
||

+ | | b == Zero = work t t0 bs |
||

+ | | otherwise = work t t1 bs |
||

+ | |||

+ | encode_huffman :: (Eq t) => [t] -> (Tree t, Bits) |
||

+ | encode_huffman xs = |
||

+ | let t = huffman_build $ markov1 xs |
||

+ | bs = encode (huffman_codebook t) xs |
||

+ | in (t,bs) |
||

+ | |||

+ | decode_huffman :: (Eq t) => Tree t -> Bits -> [t] |
||

+ | decode_huffman = decode |
||

+ | </haskell> |
||

+ | |||

+ | If anybody can make this code shorter / more elegant, feel free! |
||

+ | |||

+ | A short demo: |
||

+ | <haskell> |
||

+ | > encode_huffman "this is just a simple test" |
||

+ | <loads of data> |
||

+ | |||

+ | > decode_huffman (fst it) (snd it) |
||

+ | "this is just a simple test" |
||

+ | </haskell> |

## Latest revision as of 01:59, 9 March 2007

## [edit] 1 About

This code is provided in the hope that someone might find it interesting/entertaining, and to demonstrate what an excellent programming language Haskell truly is. (A working polymorphic LZW implementation in 10 lines? Try *that* in Java!)

This is 'toy' code. Please don't try to use it to compress multi-GB of data. It has not been thoroughly checked for correctness, and I shudder to think what the time and space complexity would be like! However, it is enlightening and entertaining to see how many algorithms you can implement with a handful of lines...

MathematicalOrchid 16:46, 15 February 2007 (UTC)

## [edit] 2 Main module

module Compression where import List import Maybe import IO (hFlush, stdout) chars = [' '..'~'] -- Becuase ' ' = 0x20 and '~' = 0x7F. -- Run-length encoding encode_RLE :: (Eq t) => [t] -> [(Int,t)] encode_RLE = map (\xs -> (length xs, head xs)) . groupBy (==) decode_RLE :: [(Int,t)] -> [t] decode_RLE = concatMap (uncurry replicate) -- Limpel-Ziv-Welch encoding encode_LZW :: (Eq t) => [t] -> [t] -> [Int] encode_LZW alphabet = work (map (:[]) alphabet) where chunk pred lst = last . takeWhile (pred . fst) . tail $ zip (inits lst) (tails lst) work table [] = [] work table lst = fromJust (elemIndex tok table) : work (table ++ [tok ++ [head rst]]) rst where (tok, rst) = chunk (`elem` table) lst decode_LZW :: [t] -> [Int] -> [t] decode_LZW alphabet xs = concat output where output = map (table !!) xs table = map (:[]) alphabet ++ zipWith (++) output (map (take 1) (tail output)) main = do x <- take 20000 `fmap` readFile "/usr/share/dict/words" let l = length x `div` 80 a = ['\0' .. '\255'] eq a b | a == b = putChar '=' >> hFlush stdout | otherwise = error "data error" cmp = zipWith eq x . decode_LZW a . encode_LZW a $ x vl = map head $ unfoldr (\cm -> case cm of [] -> Nothing ; _ -> Just (splitAt l cm)) cmp sequence_ vl

Some examples are in order:

> encode_RLE "AAAABBBBDDCCCCEEEGGFFFF" [(4,'A'),(4,'B'),(2,'D'),(4,'C'),(3,'E'),(2,'G'),(4,'F')] > decode_RLE [(4,'A'),(4,'B'),(2,'D'),(4,'C'),(3,'E'),(2,'G'),(4,'F')] "AAAABBBBDDCCCCEEEGGFFFF" > encode_LZW chars "This is just a simple test." [52,72,73,83,0,97,0,74,85,83,84,0,65,0,83,73,77,80,76,69,0,84,69,104,14] > decode_LZW chars [52,72,73,83,0,97,0,74,85,83,84,0,65,0,83,73,77,80,76,69,0,84,69,104,14] "This is just a simple test."

## [edit] 3 Huffman coding

module Huffman (count, markov1, Tree, encode_huffman, decode_huffman) where import Data.List (nub) -- Marvok1 probability model... count :: (Eq t) => [t] -> [(t,Int)] count xs = map (\x -> (x, length $ filter (x ==) xs)) $ nub xs markov1 :: (Eq t) => [t] -> [(t,Double)] markov1 xs = let n = fromIntegral $ length xs in map (\(x,c) -> (x, fromIntegral c / n)) $ count xs -- Build a Huffman tree... data Tree t = Leaf Double t | Branch Double (Tree t) (Tree t) deriving Show prob :: Tree t -> Double prob (Leaf p _) = p prob (Branch p _ _) = p get_tree :: [Tree t] -> (Tree t, [Tree t]) get_tree (t:ts) = work t [] ts where work x xs [] = (x,xs) work x xs (y:ys) | prob y < prob x = work y (x:xs) ys | otherwise = work x (y:xs) ys huffman_build :: [(t,Double)] -> Tree t huffman_build = build . map (\(t,p) -> Leaf p t) where build [t] = t build ts = let (t0,ts0) = get_tree ts (t1,ts1) = get_tree ts0 in build $ Branch (prob t0 + prob t1) t0 t1 : ts1 -- Make codebook... data Bit = Zero | One deriving (Eq, Show) type Bits = [Bit] huffman_codebook :: Tree t -> [(t,Bits)] huffman_codebook = work [] where work bs (Leaf _ x) = [(x,bs)] work bs (Branch _ t0 t1) = work (bs ++ [Zero]) t0 ++ work (bs ++ [One]) t1 -- Do the coding! encode :: (Eq t) => [(t,Bits)] -> [t] -> Bits encode cb = concatMap (\x -> maybe undefined id $ lookup x cb) decode :: (Eq t) => Tree t -> Bits -> [t] decode t = work t t where work _ (Leaf _ x) [] = [x] work t (Leaf _ x) bs = x : work t t bs work t (Branch _ t0 t1) (b:bs) | b == Zero = work t t0 bs | otherwise = work t t1 bs encode_huffman :: (Eq t) => [t] -> (Tree t, Bits) encode_huffman xs = let t = huffman_build $ markov1 xs bs = encode (huffman_codebook t) xs in (t,bs) decode_huffman :: (Eq t) => Tree t -> Bits -> [t] decode_huffman = decode

If anybody can make this code shorter / more elegant, feel free!

A short demo:

> encode_huffman "this is just a simple test" <loads of data> > decode_huffman (fst it) (snd it) "this is just a simple test"