// HTML-Validator / C / validator.c
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>

// Fixed sizes used for the line buffer and the token storage.
// bufferLengh: size of the buffer loadFile() passes to fgets() for each read.
#define bufferLengh 255
// initTokenArrayLength: number of token slots allocated by prepTokenArray();
// also the starting value of currentMaxTokenCount.
#define initTokenArrayLength 128
// tokenLength: bytes reserved for every individual token string.
#define tokenLength 255 // Potentially rework, have dynamic token lengths 

// Lists of tokens accepted inside <body>. validToken() walks both arrays with
// the same index, so entry i of nestablesEnding is the closer for entry i of
// nestables and the two arrays must stay the same length.
const char* const nestables[] = {"<h1>", "<h2>", "<h3>", "<p>", "<ul>", "<li>", "<a>", "<div>"};
const char* const nestablesEnding[] = {"</h1>", "</h2>", "</h3>", "</p>", "</ul>", "</li>", "</a>", "</div>"};
// An open <p> may not contain another <p> or a <div>; checkBody() enforces this.

// NOTE(review): `valid` is not referenced anywhere in the visible part of this
// file - confirm whether it is still needed.
const char valid[] = "VALID";

// Globals, having to pass these as parameters cluttered code...
// tokenCount: number of tokens stored in `tokens` so far (advanced by putToken()).
int tokenCount =0;
// head: index of the next free slot in `stack`; 0 means the stack is empty.
int head =0;
// currentMaxTokenCount: number of slots currently allocated in `tokens`;
// doubled by expandTokenArray().
int currentMaxTokenCount =initTokenArrayLength;
// tokens: array of token strings, allocated by prepTokenArray().
char** tokens;
// stack: stack of currently open tags, allocated by prepareStack() and
// manipulated by push().
char** stack;

/* Reports a heap allocation failure on stderr and terminates the program
   with EXIT_FAILURE. Never returns. */
void failedMemAllocation()
{
    fputs("Error allocating heap memory!\n", stderr);
    exit(EXIT_FAILURE);
}

/* Reports a tokenizing failure and terminates the program with EXIT_FAILURE.
   stringPtr is the offending input text; it is echoed to stdout verbatim
   after the prefix, with no newline added. Never returns. */
void lexingError(char* stringPtr)
{
    fputs("Error Detected: ", stdout);
    fputs(stringPtr, stdout);
    exit(EXIT_FAILURE);
}

/* Prints a validation error on its own line to stdout and terminates the
   program with EXIT_FAILURE. Never returns. */
void presentError(char errorMessage[])
{
    fputs("Error detected: ", stdout);
    puts(errorMessage); /* puts() supplies the trailing newline */
    exit(EXIT_FAILURE);
}

/* Announces that validation passed and terminates the program with
   EXIT_SUCCESS. Never returns. */
void parsingSuccessfull()
{
    puts("No errors detected"); /* puts() supplies the trailing newline */
    exit(EXIT_SUCCESS);
}

/* Allocates the global token array: initTokenArrayLength slots, each with
   room for a token of up to tokenLength bytes (terminator included).
   Every allocation is checked; on failure failedMemAllocation() terminates
   the program. Slots are zero-filled so each one starts out as an empty
   string instead of holding indeterminate bytes. */
void prepTokenArray()
{
    tokens = malloc(initTokenArrayLength * sizeof *tokens);
    if (!tokens) { failedMemAllocation(); }

    for (int i = 0; i < initTokenArrayLength; i++)
    {
        tokens[i] = calloc(tokenLength, sizeof *tokens[i]);
        if (!tokens[i]) { failedMemAllocation(); }
    }
}

/* Doubles the capacity of the global token array and allocates a
   tokenLength-byte buffer for every newly added slot.
   The realloc result is kept in a temporary so the old array is not lost if
   the call fails, and every allocation is checked; on failure
   failedMemAllocation() terminates the program. */
void expandTokenArray()
{
    const int oldCapacity = currentMaxTokenCount;
    const int newCapacity = oldCapacity * 2;

    char **grown = realloc(tokens, (size_t)newCapacity * sizeof *grown);
    if (!grown) { failedMemAllocation(); }
    tokens = grown;

    // Only the upper half is new; the lower half keeps its existing buffers.
    for (int i = oldCapacity; i < newCapacity; i++)
    {
        tokens[i] = calloc(tokenLength, sizeof *tokens[i]);
        if (!tokens[i]) { failedMemAllocation(); }
    }
    currentMaxTokenCount = newCapacity;
}

void putToken(char* tokenPtr)
{
    if (strcmp(tokenPtr, "")!=0) // We dont want to put empty tokens
    {
        // Expand token array 2x when full 
        if (tokenCount >= currentMaxTokenCount)
        {
            expandTokenArray();
        }
        strcpy(tokens[tokenCount], tokenPtr);
        tokenCount++;
    }
}

/* Splits one line of input into tag tokens and stores them with putToken().

   Only text between '<' and '>' is kept. Attributes are dropped, so
   `<a href="x">` is stored as `<a>`. Text outside of tags is ignored.
   A '<' inside an open tag, a '>' without a preceding '<', or a tag whose
   attribute list is not closed by '>' on the same line is reported through
   lexingError(), which terminates the program.

   stringPtr must be a NUL-terminated string. */
void tokenize(char* stringPtr)
{
    /* We build this up as we iterate the string.
    Strtok was not suitable, build up tokens char by char */
    char tempToken[tokenLength];
    size_t used = 0;     // Characters currently held in tempToken
    int insideTag = 0;   // Non-zero between a '<' and its matching '>'
    const size_t lineLength = strlen(stringPtr);

    tempToken[0] = '\0'; // Init current token

    for (size_t i = 0; i < lineLength; i++)
    {
        const char c = stringPtr[i];

        if (c == '<')
        {
            if (insideTag) { lexingError(stringPtr); } // Catches <...<
            putToken(tempToken); // Everything before < a token.
            tempToken[0] = '<';
            tempToken[1] = '\0';
            used = 1;
            insideTag = 1;
            continue;
        }

        if (!insideTag)
        {
            // Cant have a > unless we saw < already
            if (c == '>') { lexingError(stringPtr); }
            continue; // Text outside of tags is not tokenized
        }

        if (c != '>' && !isspace((unsigned char)c))
        {
            // Part of the tag name; keep room for the terminator
            if (used + 1 >= sizeof tempToken) { lexingError(stringPtr); }
            tempToken[used++] = c;
            tempToken[used] = '\0';
            continue;
        }

        if (c != '>')
        {
            // Whitespace inside a tag starts the attributes: skip ahead to
            // the closing '>' and make sure the tag really closes on this line.
            while (stringPtr[i] != '>')
            {
                if (stringPtr[i] == '\0') { lexingError(stringPtr); }
                i++;
            }
        }

        // Close the tag (excluding any attributes) and store it
        if (used + 1 >= sizeof tempToken) { lexingError(stringPtr); }
        tempToken[used++] = '>';
        tempToken[used] = '\0';
        putToken(tempToken);
        tempToken[0] = '\0';
        used = 0;
        insideTag = 0;
    }
    putToken(tempToken); // Stores a tag that was still open when the line ended
}

/* Opens filename for reading and feeds it to tokenize() one fgets() read at
   a time (at most bufferLengh - 1 characters per read).
   If the file cannot be opened a message is printed to stdout and the
   program terminates with EXIT_FAILURE. */
void loadFile(char* filename)
{
    char buffer[bufferLengh];
    FILE* filePointer = fopen(filename, "r");

    if (filePointer == NULL) // Check file not found
    {
        printf("\"%s\" does not exist...\n", filename);
        exit(EXIT_FAILURE);
    }

    for (;;)
    {
        if (fgets(buffer, bufferLengh, filePointer) == NULL) { break; }
        tokenize(buffer);
    }

    fclose(filePointer);
}

int checkmatch(int current, char compareTo[])
{
    if (strcmp(tokens[current], compareTo)==0) {return 1;}
    return 0;
}

/* Returns 1 when token is one of the known opening tags (nestables) or
   closing tags (nestablesEnding), 0 otherwise. */
int validToken(char token[])
{
    const size_t tagKinds = sizeof(nestables) / sizeof(nestables[0]);
    size_t idx = 0;

    while (idx < tagKinds)
    {
        if (strcmp(token, nestables[idx]) == 0) { return 1; }       // Opening tag
        if (strcmp(token, nestablesEnding[idx]) == 0) { return 1; } // Closing tag
        idx++;
    }
    return 0;
}

/* Allocates the global tag stack with one pointer slot per token, which is
   the most that could ever be pushed.
   push() stores pointers to the existing token strings instead of copying
   them, so no per-slot buffer is allocated here (such buffers would be
   leaked the moment a slot is assigned). At least one slot is requested so
   that an empty token list is not mistaken for an allocation failure.
   On failure failedMemAllocation() terminates the program. */
void prepareStack()
{
    const size_t slots = tokenCount > 0 ? (size_t)tokenCount : 1;

    stack = calloc(slots, sizeof *stack);
    if (!stack) { failedMemAllocation(); } // Catch null pointer
}

/* Returns 1 when a "<p>" entry is currently on the tag stack, 0 otherwise.
   Only the slots below `head` are inspected, so a <p> that has already been
   closed (popped) is not reported. */
int stackContainsP()
{
    int slot = 0;

    while (slot < head)
    {
        if (strcmp("<p>", stack[slot]) == 0) { return 1; }
        slot++;
    }
    return 0;
}

/* Feeds one body tag to the nesting stack.

   An opening tag is pushed (the stack keeps the pointer, not a copy).
   A closing tag pops the top entry and must be that entry's closer, e.g.
   </div> for <div>; anything else, including a closing tag arriving while
   the stack is empty, is reported with presentError(), which terminates the
   program. */
void push(char token[])
{
    if (token[1] != '/') // Closing tags, second character always '/'
    {
        stack[head] = token;
        head++;
        return;
    }

    // A closer with nothing open has no partner; refuse before touching the stack
    if (head == 0) { presentError("Invalid nesting found"); }

    // Collapses the stack whenever a valid closing tag is matched.
    head--;

    // Generate closer for the top of stack, i.e. <div> -> </div>
    char closer[tokenLength];
    snprintf(closer, sizeof closer, "</%s", stack[head] + 1);

    // i.e only </tag> pushes onto <tag> (bad nesting)
    if (strcmp(closer, token) != 0) { presentError("Invalid nesting found"); }
}

/* Validates the tokens between <body> and </body>.
   `current` is the index of the <body> token; </body> is expected at index
   tokenCount-2. Every token must be a known tag, <p> may not contain <p> or
   <div>, and every opened tag must be closed in order. Terminates the
   program through parsingSuccessfull() or presentError(). */
void checkBody(int current)
{
    const int bodyEnd = tokenCount - 2; // Index of </body>
    int idx = current + 1;              // First token after <body>

    while (idx < bodyEnd)
    {
        // <hr> and <br> don't have a closer, so they never touch the stack
        const int isVoidTag = checkmatch(idx, "<br>") || checkmatch(idx, "<hr>");

        if (!isVoidTag)
        {
            if (!validToken(tokens[idx]))
            {
                presentError("Invalid token found in body");
            }
            else if ((checkmatch(idx, "<div>") || checkmatch(idx, "<p>"))
                     && stackContainsP())
            {
                // <p> cannot contain <div> or <p>
                presentError("Tried to nest invalid tags within <p>");
            }
            push(tokens[idx]);
        }
        idx++;
    }

    if (head != 0)
    {
        presentError("Unclosed tags found");
    }
    else
    {
        // I.e. Stack is empty, each <tag> had properly nested </tag>
        parsingSuccessfull();
    }
}

/* Validates the overall document structure held in the global token array:
   <html> first and </html> last, then either <head></head> or
   <head><title></title></head>, then <body> ... </body> immediately before
   </html>. Body content is delegated to checkBody().
   Always terminates the program through parsingSuccessfull() or
   presentError(). */
void checkTokens()
{
    // Fewer than two tokens cannot hold <html>...</html>; reject up front so
    // nothing below indexes outside the stored tokens (e.g. an empty file).
    if (tokenCount < 2) { presentError("Expected <html>...</html>"); }

    prepareStack();
    // Checking the opening of file is bit tedious
    int current = 0;
    if (!(checkmatch(current, "<html>")
        && checkmatch(tokenCount-1, "</html>")))
    {
        presentError("Expected <html>...</html>");
    }
    current++;
    if (checkmatch(current, "<head>")
        && checkmatch(current+1, "</head>"))
    {
        current+=2;
    }
    else if (checkmatch(current, "<head>")
        && checkmatch(current+1, "<title>")
        && checkmatch(current+2, "</title>")
        && checkmatch(current+3, "</head>"))
    {
        current+=4;
    }
    else {presentError("Expected <head> </head> or <head> <title>...</title> </head>");}
    if (checkmatch(current, "<body>")
        && checkmatch(current+1, "</body>")
        && tokenCount ==6)
    {
        parsingSuccessfull();
        /*  Base case: <html> <head> </head> <body> </body> </html>
            The variant with <title> </title> has 8 tokens, so it takes the
            branch below and checkBody() finds nothing between the body tags. */
    }
    else if (checkmatch(current, "<body>")
        && checkmatch(tokenCount-2, "</body>"))
    {
        /* <body> follows the head section and </body> sits right before </html> */
        checkBody(current);
    }
    else {presentError("Expected <body> ... </body> after <head>...</head> section");}
}

/* Entry point. With no argument "file.html" is validated, with one argument
   that file is validated; any other argument count is rejected with
   EXIT_FAILURE. */
int main(int argc, char *argv[])
{
    prepTokenArray();

    switch (argc)
    {
        case 1: // Open file.html by default
            loadFile("file.html");
            break;
        case 2:
            loadFile(argv[1]);
            break;
        default:
            printf("Too many command line args...");
            exit(EXIT_FAILURE);
    }

    checkTokens();
}