# nlp-cw / task2_submission.py
# task2_submission.py
# Raw
#!/usr/bin/env python
# -*- coding: utf-8 -*-

######################################################################
#
# (c) Copyright University of Southampton, 2021
#
# Copyright in this software belongs to University of Southampton,
# Highfield, University Road, Southampton SO17 1BJ
#
# Created By : Stuart E. Middleton
# Created Date : 2021/01/29
# Project : Teaching
#
######################################################################

from __future__ import absolute_import, division, print_function, unicode_literals

import sys, codecs, json, math, time, warnings, re, logging

warnings.simplefilter(action='ignore', category=FutureWarning)

import nltk, numpy, scipy, sklearn, sklearn_crfsuite, sklearn_crfsuite.metrics

LOG_FORMAT = ('%(levelname) -s %(asctime)s %(message)s')
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
logger.info('logging started')


def exec_regex_questions(file_chapter=None):
    """Extract the questions found in a chapter of text into questions.txt (task 2).

    The chapter is read as UTF-8 and re-flowed so that every paragraph
    (a run of lines delimited by blank lines) becomes a single line. That text
    is then scanned with a regex for spans that end in '?'. The distinct
    matches are written, one per line and in no particular order, to
    'questions.txt' in the current working directory.

    Args:
        file_chapter: path of a www.gutenberg.org sourced plain text file
            holding one chapter of a book.

    Returns:
        None. The result is the 'questions.txt' file.
    """
    # Input >> www.gutenberg.org sourced plain text file for a chapter of a book
    # Output >> questions.txt = plain text set of extracted questions. one line per question.

    paragraphs = []
    parts = []
    # 'with' guarantees the chapter file is closed, even if reading fails.
    with open(file_chapter, "r", encoding="utf8") as chapter:
        for line in chapter:
            if line.strip():
                # Hard-wrapped line inside a paragraph: join it to its
                # neighbours with a space instead of the line break.
                parts.append(line.replace("\n", " ").replace("\r", ""))
            else:
                # A blank (or whitespace-only) line terminates the paragraph.
                paragraphs.append("".join(parts).strip())
                parts = []
    if parts:
        # The file did not end with a blank line: keep the final paragraph
        # rather than silently discarding it.
        paragraphs.append("".join(parts).strip())

    book = "\n".join(paragraphs)

    # A question is the longest run of text, free of sentence terminators
    # (? ! . ; :) and newlines, that ends in '?'. The prefix consumes the
    # non-word characters (whitespace, quotes, previous terminator) in front
    # of it; '\A' additionally lets a question that opens the text match
    # in full instead of losing its first word.
    matches = re.findall(r"(?:\A\W*|\W+)([^?!\n.;:]*\?)", book)
    setQuestions = set(matches)
    # DO NOT CHANGE THE BELOW CODE WHICH WILL SERIALIZE THE ANSWERS FOR THE AUTOMATED TEST HARNESS TO LOAD AND MARK

    writeHandle = codecs.open('questions.txt', 'w', 'utf-8', errors='replace')
    for strQuestion in setQuestions:
        writeHandle.write(strQuestion + '\n')
    writeHandle.close()


# Script entry point: expects three positional command line arguments
# (ontonotes file, book file, chapter file) and runs the question extraction
# on the chapter file.
if __name__ == '__main__':
    # sys.argv[0] is the script name, so three arguments means len >= 4.
    if len(sys.argv) < 4:
        raise Exception('missing command line args : ' + repr(sys.argv))
    ontonotes_file = sys.argv[1]
    book_file = sys.argv[2]
    chapter_file = sys.argv[3]

    # All three paths are logged, but only chapter_file is used below.
    logger.info('ontonotes = ' + repr(ontonotes_file))
    logger.info('book = ' + repr(book_file))
    logger.info('chapter = ' + repr(chapter_file))

    # DO NOT CHANGE THE CODE IN THIS FUNCTION

    # Writes the extracted questions to 'questions.txt' in the working directory.
    exec_regex_questions(chapter_file)