Difference between revisions of "HXT/Practical/WebSpider"

From HaskellWiki
< HXT‎ | Practical
Jump to navigation Jump to search
(updated for HXT-9)
(Use block markup for multiline code)
 
Line 1: Line 1:
<hask>
+
<haskell>
 
module Main where
 
module Main where
   
Line 145: Line 145:
 
(3,0,1) -> Moved $ findHeader HdrLocation response
 
(3,0,1) -> Moved $ findHeader HdrLocation response
 
_ -> Error $ rspReason response
 
_ -> Error $ rspReason response
</hask>
+
</haskell>

Latest revision as of 17:10, 11 October 2011

module Main where

import Data.List
import Data.Maybe
import qualified Data.Set as Set
import System
import Network.HTTP
import Network.URI
import Text.XML.HXT.Core
import Text.XML.HXT.Curl

type MyArrow b c = IOStateArrow (Set.Set String) b c

main = do [url] <- getArgs
          lines <- runUrl url
          mapM_ putStrLn lines

split = arr (\x -> (x,x))

{- runUrl takes a seed URL and starts spidering from there, returning a
 - list of validation or other errors.
 -}
runUrl :: String -> IO [String]
runUrl url = runX (constA url
                   >>> setTraceLevel 0
                   >>> withOtherUserState Set.empty
                       (split
                        >>> checkUrl
                        >. unlines
                        >>> perform (getUserState
                                     >>> Set.size
                                     ^>> (trace 0 $ arr (\x -> "Checked " ++ show x ++ " urls")))
                       )
                  )

{- checkUrl is an arrow taking as input a pair (url, base), and producing
 - a list of errors encountered.  The url could be relative to the base.
 - This arrow will recursively check additional urls encountered in local
 - URLs.  Remote URLs are only checked for status.
 -}
checkUrl :: MyArrow (String,String) String
checkUrl = clearErrStatus
           >>>
           first normalizeUrl
           >>>
           ifA (first seenUrl)
               (fst ^>> traceString 1 ("Skipping (seen already) " ++) >>> none)
               (first markSeen
                >>>
                ifP isLocalHtml
                    validateAndSpiderUrl
                    checkUrlStatus)

{- readFromDocument uses the external curl program because of a file
 - descriptor leak in either Network.HTTP or HXT
 -}
validateAndSpiderUrl :: MyArrow (String, String) String
validateAndSpiderUrl = arr (\(x,y) -> (x,x))
                       >>>
                       first ( traceString 0 ("Validating " ++)
                               >>> readFromDocument [ withCurl []]
                               >>> selectLinks
                               >>> traceString 1 ("Found link: " ++)
                             )
--                     >>> arr (\(x,y) -> ((x,y),y)) >>> first expandURI
                       >>> (this &&& arr snd) >>> first expandURI
                       >>>
                       checkUrl


seenUrl :: MyArrow String String
seenUrl = split >>> second getUserState
          >>> (uncurry Set.member) `guardsP` (arr fst)

markSeen :: MyArrow String String
markSeen = changeUserState Set.insert

normalizeUrl = arrL (maybeToList . removeFragment)
    where removeFragment u = do uri <- parseURIReference u
                                return $ show uri { uriFragment = ""}

selectLinks :: ArrowXml a => a XmlTree String
selectLinks = deep (isElem
                    >>> hasName "a"
                    >>> getAttrValue "href"
                    >>> mkText)
              >>> getText

{- Note that we already expanded any relative URLs -}
isLocalHtml :: (String, String) -> Bool
isLocalHtml (url, base) = haveSameHost url base && isHtmlUrl url
    where haveSameHost a b = fromMaybe False
                                (do urlA <- parseURI a
                                    urlB <- parseURI b
                                    authA <- uriAuthority urlA
                                    authB <- uriAuthority urlB
                                    return $ uriRegName authA == uriRegName authB
                                )
          isHtmlUrl url = isSuffixOf ".html" url || isSuffixOf "/" url

{- Checks the status of a url and returns an error message if anything
 - other than a 200 OK response results.
 -}
checkUrlStatus :: MyArrow (String,String) String
checkUrlStatus = first (traceString 0 ("Checking status of " ++)
                        >>> arrIO (responseCode) >>> arrL (maybeToList))
                 >>> arr (\(u,b) -> u ++ " (linked from " ++ b ++ ")")

responseCode :: String -> IO (Maybe String)
responseCode url = case parseURIReference url of
                       Nothing -> return $ Just ("Bad URL: " ++ url)
                       Just uri -> if uriScheme uri == "http:"
                                       then catch (responseCode' uri)
                                                (\e -> return $ Just (show uri ++ ": " ++ show e))
                                       else return $ Nothing
    where responseCode' uri =
            -- HEAD would be sufficient except that some sites
            -- (like Amazon) disallow it :-(
            do result <- simpleHTTP $ Request uri GET [] ""
               return $ either (Just . ((show uri ++ ": ") ++) . show)
                               (responseMessage uri)
                               result
          responseMessage uri response =
              case classifyResponse response of
                  Success      -> Nothing
                  Error reason -> Just (show uri ++ ": " ++ reason)
                  Moved loc    -> Just (show uri ++ ": moved to " ++
                                        (fromMaybe "unknown location" loc))


-- Network.HTTP v3000.0.0 forgot to expose this type
type ResponseCode = (Int, Int, Int)

data HttpResponseType = Success
                      | Moved (Maybe String)   -- new location
                      | Error String  -- reason

classifyResponse :: Response -> HttpResponseType
classifyResponse response =
    case rspCode response of
        (2,0,0) -> Success
        (3,0,2) -> Success   -- Found
        (3,0,7) -> Success   -- Temporary Redirect
        (3,0,1) -> Moved $ findHeader HdrLocation response
        _       -> Error $ rspReason response