Personal tools

HXT/Practical/WebSpider

From HaskellWiki

Jump to: navigation, search
module Main where
 
import Data.List
import Data.Maybe
import qualified Data.Set as Set
import System
import Network.HTTP
import Network.URI
import Text.XML.HXT.Core
import Text.XML.HXT.Curl
 
type MyArrow b c = IOStateArrow (Set.Set String) b c
 
main = do [url] <- getArgs
          lines <- runUrl url
          mapM_ putStrLn lines
 
split = arr (\x -> (x,x))
 
{- runUrl takes a seed URL and starts spidering from there, returning a
 - list of validation or other errors.
 -}
runUrl :: String -> IO [String]
runUrl url = runX (constA url
                   >>> setTraceLevel 0
                   >>> withOtherUserState Set.empty
                       (split
                        >>> checkUrl
                        >. unlines
                        >>> perform (getUserState
                                     >>> Set.size
                                     ^>> (trace 0 $ arr (\x -> "Checked " ++ show x ++ " urls")))
                       )
                  )
 
{- checkUrl is an arrow taking as input a pair (url, base), and producing
 - a list of errors encountered.  The url could be relative to the base.
 - This arrow will recursively check additional urls encountered in local
 - URLs.  Remote URLs are only checked for status.
 -}
checkUrl :: MyArrow (String,String) String
checkUrl = clearErrStatus
           >>>
           first normalizeUrl
           >>>
           ifA (first seenUrl)
               (fst ^>> traceString 1 ("Skipping (seen already) " ++) >>> none)
               (first markSeen
                >>>
                ifP isLocalHtml
                    validateAndSpiderUrl
                    checkUrlStatus)
 
{- readFromDocument uses the external curl program because of a file
 - descriptor leak in either Network.HTTP or HXT
 -}
validateAndSpiderUrl :: MyArrow (String, String) String
validateAndSpiderUrl = arr (\(x,y) -> (x,x))
                       >>>
                       first ( traceString 0 ("Validating " ++)
                               >>> readFromDocument [ withCurl []]
                               >>> selectLinks
                               >>> traceString 1 ("Found link: " ++)
                             )
--                     >>> arr (\(x,y) -> ((x,y),y)) >>> first expandURI
                       >>> (this &&& arr snd) >>> first expandURI
                       >>>
                       checkUrl
 
 
seenUrl :: MyArrow String String
seenUrl = split >>> second getUserState
          >>> (uncurry Set.member) `guardsP` (arr fst)
 
markSeen :: MyArrow String String
markSeen = changeUserState Set.insert
 
normalizeUrl = arrL (maybeToList . removeFragment)
    where removeFragment u = do uri <- parseURIReference u
                                return $ show uri { uriFragment = ""}
 
selectLinks :: ArrowXml a => a XmlTree String
selectLinks = deep (isElem
                    >>> hasName "a"
                    >>> getAttrValue "href"
                    >>> mkText)
              >>> getText
 
{- Note that we already expanded any relative URLs -}
isLocalHtml :: (String, String) -> Bool
isLocalHtml (url, base) = haveSameHost url base && isHtmlUrl url
    where haveSameHost a b = fromMaybe False
                                (do urlA <- parseURI a
                                    urlB <- parseURI b
                                    authA <- uriAuthority urlA
                                    authB <- uriAuthority urlB
                                    return $ uriRegName authA == uriRegName authB
                                )
          isHtmlUrl url = isSuffixOf ".html" url || isSuffixOf "/" url
 
{- Checks the status of a url and returns an error message if anything
 - other than a 200 OK response results.
 -}
checkUrlStatus :: MyArrow (String,String) String
checkUrlStatus = first (traceString 0 ("Checking status of " ++)
                        >>> arrIO (responseCode) >>> arrL (maybeToList))
                 >>> arr (\(u,b) -> u ++ " (linked from " ++ b ++ ")")
 
responseCode :: String -> IO (Maybe String)
responseCode url = case parseURIReference url of
                       Nothing -> return $ Just ("Bad URL: " ++ url)
                       Just uri -> if uriScheme uri == "http:"
                                       then catch (responseCode' uri)
                                                (\e -> return $ Just (show uri ++ ": " ++ show e))
                                       else return $ Nothing
    where responseCode' uri =
            -- HEAD would be sufficient except that some sites
            -- (like Amazon) disallow it :-(
            do result <- simpleHTTP $ Request uri GET [] ""
               return $ either (Just . ((show uri ++ ": ") ++) . show)
                               (responseMessage uri)
                               result
          responseMessage uri response =
              case classifyResponse response of
                  Success      -> Nothing
                  Error reason -> Just (show uri ++ ": " ++ reason)
                  Moved loc    -> Just (show uri ++ ": moved to " ++
                                        (fromMaybe "unknown location" loc))
 
 
-- Network.HTTP v3000.0.0 forgot to expose this type
type ResponseCode = (Int, Int, Int)
 
data HttpResponseType = Success
                      | Moved (Maybe String)   -- new location
                      | Error String  -- reason
 
classifyResponse :: Response -> HttpResponseType
classifyResponse response =
    case rspCode response of
        (2,0,0) -> Success
        (3,0,2) -> Success   -- Found
        (3,0,7) -> Success   -- Temporary Redirect
        (3,0,1) -> Moved $ findHeader HdrLocation response
        _       -> Error $ rspReason response