# !/usr/bin/env python
# -*- coding: utf-8 -*-

######################################################################
#
# (c) Copyright University of Southampton, 2021
#
# Copyright in this software belongs to University of Southampton,
# Highfield, University Road, Southampton SO17 1BJ
#
# Created By : Stuart E. Middleton
# Created Date : 2021/01/29
# Project : Teaching
#
######################################################################

from __future__ import absolute_import, division, print_function, unicode_literals

import sys, codecs, json, math, time, warnings, re, logging
# The FutureWarning filter is installed before the third-party imports below
# so that warnings raised while importing them are suppressed.
warnings.simplefilter(action='ignore', category=FutureWarning)

import nltk, numpy, scipy, sklearn, sklearn_crfsuite, sklearn_crfsuite.metrics

LOG_FORMAT = ('%(levelname) -s %(asctime)s %(message)s')
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
logger.info('logging started')


def exec_regex_questions(file_chapter=None):
    """Extract the questions in a chapter of text and write them to questions.txt.

    The chapter file is read as UTF-8 and its hard-wrapped lines are merged
    into paragraphs: consecutive non-blank lines are joined with spaces and a
    blank line ends the current paragraph. A regular expression is then run
    over the paragraphs (one per line) to collect every run of text that ends
    in ``?``, contains no ``?``, ``!``, ``.``, ``;``, ``:`` or newline before
    that final ``?``, and is preceded by at least one non-word character.

    The distinct questions are written to ``questions.txt`` in the current
    working directory, one per line, in arbitrary (set) order.

    Args:
        file_chapter: path of the plain text chapter file to read. The default
            of ``None`` is not a usable value; ``open`` will reject it.

    Returns:
        None. The result is the ``questions.txt`` file.
    """
    # CHANGE BELOW CODE TO USE REGEX TO LIST ALL QUESTIONS IN THE CHAPTER OF TEXT (task 2)
    # Input >> www.gutenberg.org sourced plain text file for a chapter of a book
    # Output >> questions.txt = plain text set of extracted questions. one line per question.

    paragraphs = []
    current_parts = []
    # 'with' guarantees the chapter file is closed even if reading fails.
    with open(file_chapter, "r", encoding="utf8") as chapter_handle:
        for line in chapter_handle:
            if line in ("\n", "\r\n"):
                # A blank line terminates the paragraph collected so far.
                paragraphs.append("".join(current_parts))
                current_parts = []
            else:
                # Un-wrap the line: its newline becomes a joining space.
                current_parts.append(line.replace("\n", " ").replace("\r", ""))

    # Flush the final paragraph when the file does not end with a blank line;
    # without this its questions would be silently lost.
    if current_parts:
        paragraphs.append("".join(current_parts))

    book = "\n".join(paragraph.strip() for paragraph in paragraphs)

    # re.MULTILINE is retained from the original call; the pattern has no
    # '^' or '$' anchors, so the flag does not alter what is matched.
    setQuestions = set(re.findall(r"[\W]+([^?!\n.;:]*\?)", book, re.MULTILINE))

    # DO NOT CHANGE THE BELOW CODE WHICH WILL SERIALIZE THE ANSWERS FOR THE AUTOMATED TEST HARNESS TO LOAD AND MARK

    writeHandle = codecs.open('questions.txt', 'w', 'utf-8', errors='replace')
    for strQuestion in setQuestions:
        writeHandle.write(strQuestion + '\n')
    writeHandle.close()


if __name__ == '__main__':
    # Three positional arguments are required; only the chapter file is used
    # by this script, the other two are just logged.
    if len(sys.argv) < 4:
        raise Exception('missing command line args : ' + repr(sys.argv))
    ontonotes_file = sys.argv[1]
    book_file = sys.argv[2]
    chapter_file = sys.argv[3]

    logger.info('ontonotes = ' + repr(ontonotes_file))
    logger.info('book = ' + repr(book_file))
    logger.info('chapter = ' + repr(chapter_file))

    # DO NOT CHANGE THE CODE IN THIS FUNCTION

    exec_regex_questions(chapter_file)