// Minimal HTML structure checker.
//
// Reads a file line by line, reduces it to a flat list of tag tokens
// (attributes and all text outside tags are discarded) and then verifies:
//   <html> <head> [<title> </title>] </head> <body> ... </body> </html>
// where the body may only contain properly nested tags from `nestables`
// plus the void tags <br> and <hr>, and <p> may not contain <p> or <div>.
// The verdict is printed on stdout and reflected in the exit status.

#include <ctype.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Initial sizes for the dynamic arrays
#define BUFFER_LENGTH 255            // Initial size of the line buffer
#define INIT_TOKEN_ARRAY_LENGTH 128  // Initial capacity of `tokens`

// Lists of tokens; entry i of nestablesEnding closes entry i of nestables
const char *const nestables[] = {"<h1>", "<h2>", "<h3>", "<p>", "<ul>", "<li>", "<a>", "<div>"};
const char *const nestablesEnding[] = {"</h1>", "</h2>", "</h3>", "</p>", "</ul>", "</li>", "</a>", "</div>"};
// <p> nestables exclude <p> and <div>
const char valid[] = "VALID";

// Globals, having to pass these as parameters cluttered code...
int tokenCount = 0;                                  // Number of tokens stored in `tokens`
int head = 0;                                        // Number of entries on `stack`
int currentMaxTokenCount = INIT_TOKEN_ARRAY_LENGTH;  // Capacity of `tokens`
char **tokens;                                       // Heap copies of every token, in file order
char **stack;                                        // Open tags; entries are borrowed pointers, never freed

// Capacity of `stack` in entries
static int stackCapacity = 0;

// Reports an allocation failure on stderr and terminates with EXIT_FAILURE.
void failedMemAllocation(void)
{
    fprintf(stderr, "%s", "Error allocating heap memory!\n");
    exit(EXIT_FAILURE);
}

// Reports a malformed line (printed verbatim) and terminates with EXIT_FAILURE.
void lexingError(const char *stringPtr)
{
    printf("Error Detected: %s", stringPtr);
    exit(EXIT_FAILURE);
}

// Reports a structural error and terminates with EXIT_FAILURE.
void presentError(const char errorMessage[])
{
    printf("Error detected: %s\n", errorMessage);
    exit(EXIT_FAILURE);
}

// Reports success and terminates with EXIT_SUCCESS.
void parsingSuccessfull(void)
{
    printf("No errors detected\n");
    exit(EXIT_SUCCESS);
}

// Allocates the token array with its initial capacity. Slots start out NULL;
// each token string is allocated by putToken() when it is stored.
void prepTokenArray(void)
{
    tokens = calloc(INIT_TOKEN_ARRAY_LENGTH, sizeof *tokens);
    if (!tokens) {
        failedMemAllocation();
    }
    currentMaxTokenCount = INIT_TOKEN_ARRAY_LENGTH;
}

// Doubles the capacity of the token array, keeping the stored tokens.
void expandTokenArray(void)
{
    if (currentMaxTokenCount > INT_MAX / 2) {
        failedMemAllocation();
    }
    const int newCapacity =
        currentMaxTokenCount > 0 ? currentMaxTokenCount * 2 : INIT_TOKEN_ARRAY_LENGTH;

    // Go through a temporary so `tokens` stays valid if realloc fails
    char **grown = realloc(tokens, (size_t)newCapacity * sizeof *grown);
    if (!grown) {
        failedMemAllocation();
    }
    for (int i = tokenCount; i < newCapacity; i++) {
        grown[i] = NULL;
    }
    tokens = grown;
    currentMaxTokenCount = newCapacity;
}

// Appends a heap copy of tokenPtr to `tokens`. Empty strings are ignored.
void putToken(const char *tokenPtr)
{
    if (tokenPtr[0] == '\0') { // We dont want to put empty tokens
        return;
    }
    if (!tokens || tokenCount >= currentMaxTokenCount) {
        expandTokenArray();
    }

    // Exact-size copy, so a token can be as long as the line it came from
    const size_t size = strlen(tokenPtr) + 1;
    char *copy = malloc(size);
    if (!copy) {
        failedMemAllocation();
    }
    memcpy(copy, tokenPtr, size);
    tokens[tokenCount++] = copy;
}

// Splits one line into tag tokens and appends them to `tokens`.
//
// A token is "<name>": everything from '<' up to the first whitespace or '>'
// with a '>' appended, so attributes are stripped. Characters outside a tag
// are discarded. A '<' inside a tag, a '>' outside one, or attributes that
// are not followed by '>' on the same line are reported through
// lexingError(). A tag still open at the end of the line (without
// attributes) is stored as it stands, e.g. "<div".
void tokenize(const char *stringPtr)
{
    const size_t length = strlen(stringPtr);

    // A token is built only from characters of the line, so it never needs
    // more than length + 1 bytes.
    char *current = malloc(length + 1);
    if (!current) {
        failedMemAllocation();
    }
    size_t used = 0;
    int insideTag = 0;

    for (size_t i = 0; i < length; i++) {
        const char c = stringPtr[i];

        if (c == '<') {
            if (insideTag) { // Catches <...<
                lexingError(stringPtr);
            }
            insideTag = 1;
            used = 0;
            current[used++] = '<';
        } else if (c == '>') {
            if (!insideTag) { // Cant have a > unless we saw < already
                lexingError(stringPtr);
            }
            current[used++] = '>';
            current[used] = '\0';
            putToken(current);
            used = 0;
            insideTag = 0;
        } else if (insideTag && isspace((unsigned char)c)) {
            // Cleans out attributes: skip ahead to the '>' closing this tag
            do {
                i++;
            } while (i < length && stringPtr[i] != '>');
            if (i >= length) { // Make sure a tag closes with >
                lexingError(stringPtr);
            }
            // Add the tag excluding the attribute
            current[used++] = '>';
            current[used] = '\0';
            putToken(current);
            used = 0;
            insideTag = 0;
        } else if (insideTag) {
            current[used++] = c;
        }
        // Anything else is text outside a tag and is dropped
    }

    current[used] = '\0';
    putToken(current); // Stores a tag left open at end of line, if any
    free(current);
}

// Reads `filename` and tokenizes it one line at a time. Lines of any length
// are handled by growing the line buffer. Terminates with EXIT_FAILURE if
// the file cannot be opened.
void loadFile(char *filename)
{
    FILE *filePointer = fopen(filename, "r");
    if (!filePointer) { // Check file not found
        printf("\"%s\" does not exist...\n", filename);
        exit(EXIT_FAILURE);
    }

    size_t capacity = BUFFER_LENGTH;
    size_t length = 0; // Characters of the current line read so far
    char *line = malloc(capacity);
    if (!line) {
        failedMemAllocation();
    }

    for (;;) {
        const size_t room = capacity - length; // Always >= 2 here
        const int chunk = room > (size_t)INT_MAX ? INT_MAX : (int)room;
        if (!fgets(line + length, chunk, filePointer)) {
            break;
        }
        length += strlen(line + length);

        if (length > 0 && line[length - 1] == '\n') {
            tokenize(line);
            length = 0;
        } else if (capacity - length <= 1) {
            // Buffer filled before the end of the line: grow, keep reading
            if (capacity > SIZE_MAX / 2) {
                failedMemAllocation();
            }
            char *grown = realloc(line, capacity * 2);
            if (!grown) {
                failedMemAllocation();
            }
            line = grown;
            capacity *= 2;
        }
    }

    if (length > 0) { // Last line had no trailing newline
        line[length] = '\0';
        tokenize(line);
    }

    free(line);
    fclose(filePointer);
}

// Returns 1 if token number `current` exists and equals compareTo, else 0.
// Indices outside [0, tokenCount) never match.
int checkmatch(int current, const char compareTo[])
{
    if (current < 0 || current >= tokenCount) {
        return 0;
    }
    return strcmp(tokens[current], compareTo) == 0;
}

// Returns 1 if token is one of the nestable opening or closing tags, else 0.
int validToken(const char token[])
{
    const size_t count = sizeof nestables / sizeof nestables[0];
    for (size_t i = 0; i < count; i++) {
        if (strcmp(token, nestables[i]) == 0 || strcmp(token, nestablesEnding[i]) == 0) {
            return 1;
        }
    }
    return 0;
}

// Resizes `stack` to hold at least `wanted` entries.
static void resizeStack(int wanted)
{
    const int capacity = wanted > 0 ? wanted : 1;
    char **grown = realloc(stack, (size_t)capacity * sizeof *grown);
    if (!grown) {
        failedMemAllocation();
    }
    stack = grown;
    stackCapacity = capacity;
}

// Creates an empty tag stack with room for one entry per token. The entries
// are pointers to strings owned elsewhere, so only the pointer array is
// allocated.
void prepareStack(void)
{
    free(stack);
    stack = NULL;
    head = 0;
    resizeStack(tokenCount);
}

// Linear scan of stack, will be false if <p> has been closed already
int stackContainsP(void)
{
    for (int i = 0; i < head; i++) {
        if (strcmp("<p>", stack[i]) == 0) {
            return 1;
        }
    }
    return 0;
}

// Feeds one tag to the nesting stack.
//
// An opening tag is pushed; the stack keeps the pointer, so `token` must
// outlive it. A closing tag (second character '/') pops the top entry and
// must be its closer, i.e. "</div>" for "<div>"; otherwise, or if nothing is
// open, "Invalid nesting found" is reported through presentError().
void push(char token[])
{
    if (token[1] != '/') {
        if (head >= stackCapacity) {
            if (stackCapacity > INT_MAX / 2) {
                failedMemAllocation();
            }
            resizeStack(stackCapacity > 0 ? stackCapacity * 2 : 16);
        }
        stack[head++] = token;
        return;
    }

    if (head == 0) { // Closing tag with nothing open
        presentError("Invalid nesting found");
    }
    head--;

    // "</name>" closes "<name>": compare what follows "</" with what follows "<"
    const char *opener = stack[head];
    if (token[0] != '<' || opener[0] == '\0' || strcmp(token + 2, opener + 1) != 0) {
        presentError("Invalid nesting found");
    }
}

// Validates the tokens strictly between <body> (at index `current`) and
// </body> (at index tokenCount - 2), then reports the verdict and exits.
void checkBody(int current)
{
    for (current = current + 1; current < tokenCount - 2; current++) {
        // <hr> and <br> don't have a closer
        if (checkmatch(current, "<br>") || checkmatch(current, "<hr>")) {
            continue;
        }
        if (!validToken(tokens[current])) {
            presentError("Invalid token found in body");
        }
        // <p> cannot contain <div> or <p>
        if ((checkmatch(current, "<div>") || checkmatch(current, "<p>")) && stackContainsP()) {
            presentError("Tried to nest invalid tags within <p>");
        }
        push(tokens[current]);
    }

    if (head != 0) { // Some <tag> never got its </tag>
        presentError("Unclosed tags found");
    }
    parsingSuccessfull();
}

// Validates the whole token list, reports the verdict and exits.
// Accepted skeletons:
//   <html> <head> </head> <body> ... </body> </html>
//   <html> <head> <title> </title> </head> <body> ... </body> </html>
void checkTokens(void)
{
    prepareStack();

    int current = 0;
    if (!(checkmatch(current, "<html>") && checkmatch(tokenCount - 1, "</html>"))) {
        presentError("Expected <html>...</html>");
    }
    current++;

    if (checkmatch(current, "<head>") && checkmatch(current + 1, "</head>")) {
        current += 2;
    } else if (checkmatch(current, "<head>") && checkmatch(current + 1, "<title>") &&
               checkmatch(current + 2, "</title>") && checkmatch(current + 3, "</head>")) {
        current += 4;
    } else {
        presentError("Expected <head> </head> or <head> <title>...</title> </head>");
    }

    // An empty body is the case where checkBody() has nothing to look at
    if (checkmatch(current, "<body>") && checkmatch(tokenCount - 2, "</body>")) {
        checkBody(current);
    } else {
        presentError("Expected <body> ... </body> after <head>...</head> section");
    }
}

// Define HTML_CHECKER_NO_MAIN to leave out main(), e.g. to link the functions
// above into a test driver.
#ifndef HTML_CHECKER_NO_MAIN
int main(int argc, char *argv[])
{
    prepTokenArray();

    if (argc == 1) { // Open file.html by default
        loadFile("file.html");
    } else if (argc == 2) {
        loadFile(argv[1]);
    } else {
        printf("Too many command line args...");
        exit(EXIT_FAILURE);
    }

    checkTokens(); // Prints the verdict and exits
    return EXIT_SUCCESS;
}
#endif