Personal tools

Haskell Quiz/Bytecode Compiler/Solution Justin Bailey

From HaskellWiki

< Haskell Quiz | Bytecode Compiler(Difference between revisions)
Jump to: navigation, search
(Updated to corrrect code)
Line 1: Line 1:
 
[[Category:Code]]
 
[[Category:Code]]
This solution isn't the cleanest or quite tested, but if you load it in hugs and run:
+
This solution should work correctly. I was unable to test the byte codes generated, for obvious reasons. However, all test strings from the quiz do evaluate to the correct values.
   
<haskell>
+
To see the (symbolic) byte codes generated, run <hask>generate_tests</hask>. To see the actual byte codes, run <hask>compile_tests</hask>. To see that the values produced by each expression match those expected, run <hask>eval_tests</hask>. The tests are contained in the variables <hask>test1,test2, ..., test6</hask>, which correspond to the six "test_n" methods fouind in the quiz's test program.
compile $ parse $ tokenize $ stmt1
 
compile $ parse $ tokenize $ stmt2
 
...
 
compile $ parse $ tokenize $ stmt8
 
</haskell>
 
 
You'll see the byte codes generated. I didn't implement any optimizations (i.e. using SWAP).
 
 
Sadly, the parsing algorithm does not preserve precedence correctly. For example, parsing "2*2+2" produces
 
 
<haskell>
 
> parse $ tokenize "2*2+2"
 
Statement Mult (Statement Plus (Val 2) (Val 2)) (Val 2)
 
</haskell>
 
   
Which is incorrect. Any suggestions?
+
The byte codes aren't optimized. For example, SWAP is never used. However, they should produce correct results (even for negative and LCONST/CONST values).
   
 
The code below is literate Haskell.
 
The code below is literate Haskell.
Line 10: Line 10:
 
<haskell>
 
<haskell>
 
\begin{code}
 
\begin{code}
import Data.Char(isAlpha, isDigit, isSeparator)
+
import Text.ParserCombinators.Parsec hiding (parse)
  +
import qualified Text.ParserCombinators.Parsec as P (parse)
  +
import Text.ParserCombinators.Parsec.Expr
  +
import Data.Bits
   
data Op = Plus | Minus | Mult | Div | Pow | Mod
+
-- Represents various operations that can be applied
deriving (Show, Eq)
+
-- to expressions.
+
data Op = Plus | Minus | Mult | Div | Pow | Mod | Neg
data Token = RightParen
 
| LeftParen
 
| Number Integer
 
| Operator Op
 
 
deriving (Show, Eq)
 
deriving (Show, Eq)
   
  +
-- Represents expression we can build - either numbers or expressions
  +
-- connected by operators.
 
data Expression = Statement Op Expression Expression
 
data Expression = Statement Op Expression Expression
 
| Val Integer
 
| Val Integer
Line 22: Line 24:
 
deriving (Show)
 
deriving (Show)
   
  +
-- Define the byte codes that can be generated.
 
data Bytecode = NOOP | CONST Integer | LCONST Integer
 
data Bytecode = NOOP | CONST Integer | LCONST Integer
 
| ADD
 
| ADD
Line 32: Line 35:
 
deriving (Show)
 
deriving (Show)
   
expr1 = Statement Plus (Val 1) (Val 2)
 
expr2 = Statement Mult (Statement Plus (Val 1) (Val 2)) (Val 3)
 
   
-- Take a statement and evaluate
+
-- Using imported Parsec.Expr library, build a parser for expressions.
  +
expr :: Parser Expression
  +
expr =
  +
buildExpressionParser table factor
  +
<?> "expression"
  +
where
  +
-- Recognizes a factor in an expression
  +
factor =
  +
do{ char '('
  +
; x <- expr
  +
; char ')'
  +
; return x
  +
}
  +
<|> number
  +
<?> "simple expression"
  +
-- Recognizes a number
  +
number :: Parser Expression
  +
number = do{ ds <- many1 digit
  +
; return (Val (read ds))
  +
}
  +
<?> "number"
  +
-- Specifies operator, associativity, precendence, and constructor to execute
  +
-- and built AST with.
  +
table =
  +
[[prefix "-" (Statement Mult (Val (-1)))],
  +
[binary "^" (Statement Pow) AssocRight],
  +
[binary "*" (Statement Mult) AssocLeft, binary "/" (Statement Div) AssocLeft, binary "%" (Statement Mod) AssocLeft],
  +
[binary "+" (Statement Plus) AssocLeft, binary "-" (Statement Minus) AssocLeft]
  +
]
  +
where
  +
binary s f assoc
  +
= Infix (do{ string s; return f}) assoc
  +
prefix s f
  +
= Prefix (do{ string s; return f})
  +
  +
  +
-- Parses a string into an AST, using the parser defined above
  +
parse s = case P.parse expr "" s of
  +
Right ast -> ast
  +
Left e -> error $ show e
  +
  +
-- Take AST and evaluate (mostly for testing)
 
eval (Val n) = n
 
eval (Val n) = n
 
eval (Statement op left right)
 
eval (Statement op left right)
Line 45: Line 46:
 
| op == Mod = eval left `mod` eval right
 
| op == Mod = eval left `mod` eval right
   
-- Take a string and tokenize it
+
-- Takes an AST and turns it into a byte code list
tokenize [] = []
+
generate stmt = generate' stmt []
tokenize ('(':xs) = LeftParen : tokenize xs
+
where
tokenize (')':xs) = RightParen : tokenize xs
+
generate' (Statement op left right) instr =
tokenize (x:xs) | isDigit x = let (val, remaining) = span isDigit (x:xs)
+
let
in Number (read val) : tokenize remaining
+
li = generate' left instr
| isOperator x = toOperator x : tokenize xs
+
ri = generate' right instr
| isSeparator x = tokenize (dropWhile isSeparator xs)
+
lri = li ++ ri
where
+
in case op of
makeOp '+' = Just $ Operator Plus
+
Plus -> lri ++ [ADD]
makeOp '-' = Just $ Operator Minus
+
Minus -> lri ++ [SUB]
makeOp '*' = Just $ Operator Mult
+
Mult -> lri ++ [MUL]
makeOp '/' = Just $ Operator Div
+
Div -> lri ++ [DIV]
makeOp '^' = Just $ Operator Pow
+
Mod -> lri ++ [MOD]
makeOp '%' = Just $ Operator Mod
+
Pow -> lri ++ [POW]
makeOp _ = Nothing
+
generate' (Val n) instr =
toOperator x = case makeOp x of
+
if abs(n) > 32768
Just x -> x
+
then instr ++ [LCONST n]
_ -> error "Bad operator."
+
else instr ++ [CONST n]
isOperator x = case makeOp x of
 
Just x -> True
 
_ -> False
 
   
stmt1 = "1 + 2" -- 3
+
-- Takes a statement and converts it into a list of actual bytes to
stmt2 = "1 + 2 * 3" -- 7
+
-- be interpreted
stmt3 = "(1 + 2) * 3" -- 9
+
compile s = toBytes (generate $ parse s)
stmt4 = "4 - 5 * 2" -- -6
 
stmt5 = "5 * (2 - 4)" -- -10
 
stmt6 = "(1*3)*4*(5*6)" -- 360
 
stmt7 = "2^(2+(3/2)^2)" -- 8
 
stmt8 = "(10%3)*(2+2)" -- 4
 
   
{-
+
-- Convert a list of byte codes to a list of integer codes. If LCONST or CONST
Based on http://www.engr.mun.ca/~theo/Misc/exp_parsing.htm
+
-- instruction are seen, correct byte representantion is produced
  +
toBytes ((NOOP):xs) = 0 : toBytes xs
  +
toBytes ((CONST n):xs) = 1 : (toConstBytes (fromInteger n)) ++ toBytes xs
  +
toBytes ((LCONST n):xs) = 2 : (toLConstBytes (fromInteger n)) ++ toBytes xs
  +
toBytes ((ADD):xs) = 0x0a : toBytes xs
  +
toBytes ((SUB):xs) = 0x0b : toBytes xs
  +
toBytes ((MUL):xs) = 0x0c : toBytes xs
  +
toBytes ((POW):xs) = 0x0d : toBytes xs
  +
toBytes ((DIV):xs) = 0x0e : toBytes xs
  +
toBytes ((MOD):xs) = 0x0f : toBytes xs
  +
toBytes ((SWAP):xs) = 0x0a : toBytes xs
  +
toBytes [] = []
   
E -> E + E
+
-- Convert number to CONST representation (2 element list)
E -> E * E
+
toConstBytes n = toByteList 2 n
E -> E / E
+
toLConstBytes n = toByteList 4 n
E -> E - E
 
E -> E % E
 
E -> E ^ E
 
E -> ( E )
 
E -> n
 
 
Transform to
 
 
E --> Exp(0)
 
Exp(p) --> P {B Exp(q)}
 
P --> "(" E ")" | v
 
B --> "+" | "-" | "*" |"/" | "^" | "%"
 
   
Precedence
+
-- Convert a number into a list of 8-bit bytes (big-endian/network byte order).
+
-- Make sure final list is size elements long
^ 3
+
toByteList :: Bits Int => Int -> Int -> [Int]
*, /, % 2
+
toByteList size n =
+, - 1
+
if (length bytes) < size
  +
then (replicate (size - (length bytes)) 0) ++ bytes
  +
else bytes
  +
where
  +
bytes = reverse $ toByteList' n
  +
-- for negative, and with signed bit and remove negative. Then continue recursion.
  +
toByteList' 0 = []
  +
toByteList' a | a < 0 = (a .&. 511) : toByteList' (abs(a) `shiftR` 8)
  +
| otherwise = (a .&. 255) : toByteList' (a `shiftR` 8)
   
-}
+
-- All tests defined by the quiz, with the associated values they should evaluate to.
  +
test1 = [(2+2, "2+2"), (2-2, "2-2"), (2*2, "2*2"), (2^2, "2^2"), (2 `div` 2, "2/2"),
  +
(2 `mod` 2, "2%2"), (3 `mod` 2, "3%2")]
   
-- define precdence of operators
+
test2 = [(2+2+2, "2+2+2"), (2-2-2, "2-2-2"), (2*2*2, "2*2*2"), (2^2^2, "2^2^2"), (4 `div` 2 `div` 2, "4/2/2"),
precedence Plus = 1
+
(7`mod`2`mod`1, "7%2%1")]
precedence Minus = 1
+
precedence Mult = 2
+
test3 = [(2+2-2, "2+2-2"), (2-2+2, "2-2+2"), (2*2+2, "2*2+2"), (2^2+2, "2^2+2"),
precedence Div = 3
+
(4 `div` 2+2, "4/2+2"), (7`mod`2+1, "7%2+1")]
precedence Mod = 2
 
precedence Pow = 3
 
   
-- Precedence comparison - gets precedence of
+
test4 = [(2+(2-2), "2+(2-2)"), (2-(2+2), "2-(2+2)"), (2+(2*2), "2+(2*2)"), (2*(2+2), "2*(2+2)"),
-- given operator and determines if its greater than the value given.
+
(2^(2+2), "2^(2+2)"), (4 `div` (2+2), "4/(2+2)"), (7`mod`(2+1), "7%(2+1)")]
(>*) op val = precedence op >= val
 
   
-- Precedence addition - for left associative operators,
+
test5 = [(-2+(2-2), "-2+(2-2)"), (2-(-2+2), "2-(-2+2)"), (2+(2 * -2), "2+(2*-2)")]
-- return its precedence + 1. For righ associative, just return the operators
 
-- precedence.
 
prec_add p@(Pow) = precedence p
 
prec_add p = 1 + precedence p
 
   
parse [] = error "Can't parse empty list of tokens"
+
test6 = [((3 `div` 3)+(8-2), "(3/3)+(8-2)"), ((1+3) `div` (2 `div` 2)*(10-8), "(1+3)/(2/2)*(10-8)"),
parse tokens = fst $ parseE tokens 0
+
((1*3)*4*(5*6), "(1*3)*4*(5*6)"), ((10`mod`3)*(2+2), "(10%3)*(2+2)"), (2^(2+(3 `div` 2)^2), "2^(2+(3/2)^2)"),
where parseE tokens prec
+
((10 `div` (2+3)*4), "(10/(2+3)*4)"), (5+((5*4)`mod`(2+1)), "5+((5*4)%(2+1))")]
= let (p, remaining) = parseP tokens prec
 
in
 
if remaining == []
 
then (p, remaining)
 
else case head remaining of
 
Operator op ->
 
if op >* prec
 
then
 
let (right, rest) = parseE (tail remaining) $ prec_add op
 
in (Statement op p right, rest)
 
else let (left, rest) = parseE (tail remaining) $ prec
 
in (Statement op left p, rest)
 
_ -> (p, remaining)
 
parseP ((Number n):ts) prec = (Val n, ts)
 
parseP ((LeftParen):ts) prec
 
= let (e, remaining) = parseE ts 0
 
in (e, tail remaining)
 
   
compile stmt = compile' stmt []
+
-- Evaluates the tests and makes sure the expressions match the expected values
  +
eval_tests = map eval_tests [test1, test2, test3, test4, test5, test6]
  +
where
  +
eval_tests ((val, stmt):ts) =
  +
let eval_val = eval $ parse stmt
  +
in
  +
if val == eval_val
  +
then "True" : eval_tests ts
  +
else (stmt ++ " evaluated incorrectly to " ++ show eval_val ++ " instead of " ++ show val) : eval_tests ts
  +
eval_tests [] = []
   
compile' (Statement op left right) instr =
+
-- Takes all the tests and displays symbolic bytes codes for each
let li = compile' left instr
+
generate_tests = map generate_all [test1,test2,test3,test4,test5,test6]
ri = compile' right instr
+
where generate_all ((val, stmt):ts) = generate (parse stmt) : generate_all ts
lri = li ++ ri
+
generate_all [] = []
in case op of
+
Plus -> lri ++ [ADD]
+
-- Takes all tests and generates a list of bytes representing them
Minus -> lri ++ [SUB]
+
compile_tests = map compile_all [test1,test2,test3,test4,test5,test6]
Mult -> lri ++ [MUL]
+
where compile_all ((val, stmt):ts) = compile stmt : compile_all ts
Div -> lri ++ [DIV]
+
compile_all [] = []
Mod -> lri ++ [MOD]
 
Pow -> lri ++ [POW]
 
compile' (Val n) instr = instr ++ [LCONST n]
 
   
\end{code}
+
\end{code}
 
</haskell>
 
</haskell>

Revision as of 23:43, 7 November 2006

This solution should work correctly. I was unable to test the byte codes generated, for obvious reasons. However, all test strings from the quiz do evaluate to the correct values.

To see the (symbolic) byte codes generated, run
generate_tests
. To see the actual byte codes, run
compile_tests
. To see that the values produced by each expression match those expected, run
eval_tests
. The tests are contained in the variables
test1,test2, ..., test6
, which correspond to the six "test_n" methods fouind in the quiz's test program.

The byte codes aren't optimized. For example, SWAP is never used. However, they should produce correct results (even for negative and LCONST/CONST values).

The code below is literate Haskell.

\begin{code}
import Text.ParserCombinators.Parsec hiding (parse)
import qualified Text.ParserCombinators.Parsec as P (parse)
import Text.ParserCombinators.Parsec.Expr
import Data.Bits
 
-- Represents various operations that can be applied
-- to expressions.
data Op = Plus | Minus | Mult | Div | Pow | Mod | Neg
  deriving (Show, Eq)
 
-- Represents expression we can build - either numbers or expressions
-- connected by operators.
data Expression = Statement Op Expression Expression
           | Val Integer
           | Empty
  deriving (Show)
 
-- Define the byte codes that can be generated. 
data Bytecode = NOOP | CONST Integer | LCONST Integer
            | ADD
            | SUB
            | MUL
            | POW
            | DIV
            | MOD
            | SWAP
  deriving (Show)
 
 
-- Using imported Parsec.Expr library, build a parser for expressions.
expr :: Parser Expression
expr =
  buildExpressionParser table factor
  <?> "expression"
  where
  -- Recognizes a factor in an expression
  factor  =
    do{ char '('
          ; x <- expr
          ; char ')'
          ; return x 
          }
      <|> number
      <?> "simple expression"
  -- Recognizes a number
  number  :: Parser Expression 
  number  = do{ ds <- many1 digit
              ; return (Val (read ds))
              }
          <?> "number"
  -- Specifies operator, associativity, precendence, and constructor to execute
  -- and built AST with.
  table =
    [[prefix "-" (Statement Mult (Val (-1)))],
      [binary "^" (Statement Pow) AssocRight],
      [binary "*" (Statement Mult) AssocLeft, binary "/" (Statement Div) AssocLeft, binary "%" (Statement Mod) AssocLeft],
      [binary "+" (Statement Plus) AssocLeft, binary "-" (Statement Minus) AssocLeft]
       ]          
    where
      binary s f assoc
         = Infix (do{ string s; return f}) assoc
      prefix s f 
         = Prefix (do{ string s; return f})
 
 
-- Parses a string into an AST, using the parser defined above
parse s = case P.parse expr "" s of
  Right ast -> ast
  Left e -> error $ show e
 
-- Take AST and evaluate (mostly for testing)
eval (Val n) = n
eval (Statement op left right)
        | op == Mult = eval left * eval right
        | op == Minus = eval left - eval right
        | op == Plus = eval left + eval right
        | op == Div = eval left `div` eval right
        | op == Pow = eval left ^ eval right
        | op == Mod = eval left `mod` eval right
 
-- Takes an AST and turns it into a byte code list
generate stmt = generate' stmt []
       where
               generate' (Statement op left right) instr =
                       let
                               li = generate' left instr
                               ri = generate' right instr
                               lri = li ++ ri
                       in case op of
                               Plus -> lri ++ [ADD]
                               Minus -> lri ++ [SUB]
                               Mult -> lri ++ [MUL]
                               Div -> lri ++ [DIV]
                               Mod -> lri ++ [MOD]
                               Pow -> lri ++ [POW]
               generate' (Val n) instr =
                if abs(n) > 32768
                then instr ++ [LCONST n]
                else instr ++ [CONST n]
 
-- Takes a statement and converts it into a list of actual bytes to
-- be interpreted
compile s = toBytes (generate $ parse s)
 
-- Convert a list of byte codes to a list of integer codes. If LCONST or CONST
-- instruction are seen, correct byte representantion is produced
toBytes ((NOOP):xs) = 0 : toBytes xs 
toBytes ((CONST n):xs) = 1 : (toConstBytes (fromInteger n)) ++ toBytes xs
toBytes ((LCONST n):xs) = 2 : (toLConstBytes (fromInteger n)) ++ toBytes xs
toBytes ((ADD):xs) = 0x0a : toBytes xs
toBytes ((SUB):xs) = 0x0b : toBytes xs
toBytes ((MUL):xs) = 0x0c : toBytes xs
toBytes ((POW):xs) = 0x0d : toBytes xs
toBytes ((DIV):xs) = 0x0e : toBytes xs
toBytes ((MOD):xs) = 0x0f : toBytes xs
toBytes ((SWAP):xs) = 0x0a : toBytes xs
toBytes [] = []
 
-- Convert number to CONST representation (2 element list)
toConstBytes n = toByteList 2 n 
toLConstBytes n = toByteList 4 n 
 
-- Convert a number into a list of 8-bit bytes (big-endian/network byte order).
-- Make sure final list is size elements long
toByteList ::  Bits Int => Int -> Int -> [Int]
toByteList size n =
    if (length bytes) < size
    then (replicate (size - (length bytes)) 0) ++ bytes
    else bytes
    where
      bytes = reverse $ toByteList' n
      -- for negative, and with signed bit and remove negative. Then continue recursion.
      toByteList' 0 = [] 
      toByteList' a | a < 0 = (a .&. 511) : toByteList' (abs(a) `shiftR` 8)
                    | otherwise = (a .&. 255) : toByteList' (a `shiftR` 8)
 
-- All tests defined by the quiz, with the associated values they should evaluate to.
test1 = [(2+2, "2+2"), (2-2, "2-2"), (2*2, "2*2"), (2^2, "2^2"), (2 `div` 2, "2/2"),
  (2 `mod` 2, "2%2"), (3 `mod` 2, "3%2")]
 
test2 = [(2+2+2, "2+2+2"), (2-2-2, "2-2-2"), (2*2*2, "2*2*2"), (2^2^2, "2^2^2"), (4 `div` 2 `div` 2, "4/2/2"),
  (7`mod`2`mod`1, "7%2%1")]
 
test3 = [(2+2-2, "2+2-2"), (2-2+2, "2-2+2"), (2*2+2, "2*2+2"), (2^2+2, "2^2+2"),
  (4 `div` 2+2, "4/2+2"), (7`mod`2+1, "7%2+1")]
 
test4 = [(2+(2-2), "2+(2-2)"), (2-(2+2), "2-(2+2)"), (2+(2*2), "2+(2*2)"), (2*(2+2), "2*(2+2)"),
  (2^(2+2), "2^(2+2)"), (4 `div` (2+2), "4/(2+2)"), (7`mod`(2+1), "7%(2+1)")]
 
test5 = [(-2+(2-2), "-2+(2-2)"), (2-(-2+2), "2-(-2+2)"), (2+(2 * -2), "2+(2*-2)")]
 
test6 = [((3 `div` 3)+(8-2), "(3/3)+(8-2)"), ((1+3) `div` (2 `div` 2)*(10-8), "(1+3)/(2/2)*(10-8)"), 
    ((1*3)*4*(5*6), "(1*3)*4*(5*6)"), ((10`mod`3)*(2+2), "(10%3)*(2+2)"), (2^(2+(3 `div` 2)^2), "2^(2+(3/2)^2)"),
    ((10 `div` (2+3)*4), "(10/(2+3)*4)"), (5+((5*4)`mod`(2+1)), "5+((5*4)%(2+1))")]
 
-- Evaluates the tests and makes sure the expressions match the expected values
eval_tests = map eval_tests [test1, test2, test3, test4, test5, test6]
  where
    eval_tests ((val, stmt):ts) =
      let eval_val = eval $ parse stmt 
      in
        if val == eval_val 
        then "True" : eval_tests ts
        else (stmt ++ " evaluated incorrectly to " ++ show eval_val ++ " instead of " ++ show val) : eval_tests ts
    eval_tests [] = []
 
-- Takes all the tests and displays symbolic bytes codes for each
generate_tests = map generate_all [test1,test2,test3,test4,test5,test6]
  where generate_all ((val, stmt):ts) = generate (parse stmt) : generate_all ts
        generate_all [] = []
 
-- Takes all tests and generates a list of bytes representing them
compile_tests = map compile_all [test1,test2,test3,test4,test5,test6]
  where compile_all ((val, stmt):ts) = compile stmt : compile_all ts
        compile_all [] = []
 
\end{code}