HTML-Validator / Haskell / validator.hs
validator.hs
Raw
import System.IO  
import System.Exit
import Data.List

validTokens = ["html", "head", "body", "title", "h1", "h2", "h3", "p", "ul", "li", "a", "div", "br", "hr"]
validTokens' = ["/html", "/head", "/body", "/title", "/h1", "/h2", "/h3", "/p", "/ul", "/li", "/a", "/div", "/br", "/hr"]
nestables = ["h1", "h2", "h3", "p", "ul", "li", "a", "div"]
nestables' = ["/h1", "/h2", "/h3", "/p", "/ul", "/li", "/a", "/div"]
ignorables = ["br","hr"]

main ::  IO()
main = do
    filecontent <- readFile "file.html"
    let fileLines = filecontent 
    let tokenized = map cleanAttribute $ extractTokens fileLines
    checkTokens tokenized
    checkHeader tokenized
    let bodyTokens = isolateBody tokenized
    if bodyTokens == [] then errorWithoutStackTrace "Expected <body>...</body>" else checkBodyStructure (head bodyTokens) (last bodyTokens)
    let stack = push [] (tail $ init bodyTokens) -- push tokens onto the stack one by one
    if stack == [] then putStrLn "No errors detected" else putStrLn "There remmain unclosed tags" 
 
extractTokens []       = []
extractTokens ('<':xs) = let (curr,rest) = extractTokens' xs in curr:extractTokens rest
extractTokens (_  :xs) = extractTokens xs

extractTokens' []       = ([], []) 
extractTokens' ('>':xs) = ([], xs)
extractTokens' (x  :xs) = let (curr,rest) = extractTokens' xs in (x:curr, rest)

cleanAttribute:: String -> String 
cleanAttribute s = fst $ break (' '==) s -- take everything from inside <tag ...> up untill the first space

checkTokens:: [String] -> IO()
checkTokens [] = errorWithoutStackTrace "No tags found" -- catches empty file
checkTokens (x:xs) = checkTokens' (x:xs)

checkTokens':: [String] -> IO()
checkTokens' [] = putStrLn "Scanned all tags"
checkTokens' (x:xs) = if (x `elem` validTokens || x `elem` validTokens') then  checkTokens' xs 
            else errorWithoutStackTrace $ "Invalid tag: " ++ x   

checkHeader :: [String] -> IO()
checkHeader xs
            | head xs /= "html" || last xs /= "/html" = die "Expected <html> .... <html>" 
            | not ((take 2 (tail xs)) == ["head", "/head"] || (take 4 (tail xs)) == ["head", "title", "/title", "/head"]) = die "Expected <head> </head> or <head> <title>...</title> </head>"
            | otherwise = putStrLn "Checked Header Section"

isolateBody :: [String] -> [String]
isolateBody xs = takeWhile ("/html"/=) $ dropWhile ("body"/=) xs -- gives us <body>, ...., </body> 

checkBodyStructure:: String -> String -> IO ()
checkBodyStructure x y = if (x == "body" && y == "/body") then putStrLn ("<body>...</body> as expected") else die "Expected <body>...</body>"

push:: [String] -> [String] -> [String] -- <tag> will be pushed onto stack and an appropriate </tag> will collapse the stack
push [] [] = []
push stack [] = stack
push stack (x:xs) 
            | x `elem` ignorables = push stack xs --i.e. ignore <hr> and <br> since they can freely appear
            | x `elem` nestables = if  not ((x=="div"||x=="p") && ("p" `elem` stack)) then push (x:stack) xs else errorWithoutStackTrace $ "Invalid nesting for <p> with: " ++x 
            | x `elem` nestables' = if (removeSlash x) == head stack then push (tail stack) xs else errorWithoutStackTrace $ "Invalid nesting with: " ++x --i.e wrong closer found, error         
            | otherwise = errorWithoutStackTrace $ "Invalid Tag: " ++ x

removeSlash:: String -> String --well actually removes first character, but we know it must be a nestable i.e. </tag>
removeSlash (x:xs) = xs