task1_submission.py · nlp-cw

# !/usr/bin/env python
# -*- coding: utf-8 -*-

######################################################################
#
# (c) Copyright University of Southampton, 2021
#
# Copyright in this software belongs to University of Southampton,
# Highfield, University Road, Southampton SO17 1BJ
#
# Created By : Stuart E. Middleton
# Created Date : 2021/01/29
# Project : Teaching
#
######################################################################

from __future__ import absolute_import, division, print_function, unicode_literals

import sys, codecs, json, math, time, warnings, re, logging
warnings.simplefilter( action='ignore', category=FutureWarning )

import nltk, numpy, scipy, sklearn, sklearn_crfsuite, sklearn_crfsuite.metrics

# Layout of every log record: level name, timestamp, then the message text.
LOG_FORMAT = '%(levelname) -s %(asctime)s %(message)s'

# Configure the root logger at INFO level, then obtain this module's own logger
# and emit a start-up marker.
logging.basicConfig( format=LOG_FORMAT, level=logging.INFO )
logger = logging.getLogger( __name__ )
logger.info( 'logging started' )

# Heading pattern, applied with re.MULTILINE to the one-paragraph-per-line book text.
# Groups: 1 = the whole heading, 2 = the token following "CHAPTER"/"Chapter",
# 3 = a bare roman numeral built from X/V/I, 4 = the title running to end of line.
# The title is written as \S+(?: \S+)* ? instead of (?:\S+ ?)+ : both accept exactly
# the same strings (words separated by single spaces, optional trailing space), but
# the nested quantifier backtracks exponentially when a candidate line contains a
# double space and therefore cannot reach the end-of-line anchor.
_CHAPTER_RE = re.compile(
	r"^((?:(?:CHAPTER|Chapter) ([\w\-]+|[\d\-]+)(?:[\.\:] |\n| )|([XVI]+)[\.\:\n] )(\S+(?: \S+)* ?))$",
	re.MULTILINE)


def _split_paragraphs( raw_lines ) :
	"""Join consecutive non-blank lines into paragraphs and return them stripped.

	A paragraph ends at every line that is exactly a newline; consecutive blank
	lines therefore produce empty paragraphs, which keeps the spacing the heading
	regex relies on. Text after the last blank line is kept as a final paragraph.
	"""
	paragraphs = []
	parts = []
	for line in raw_lines:
		if line == "\n" or line == "\r\n":
			paragraphs.append("".join(parts))
			parts = []
		else:
			parts.append(line.replace("\n", " ").replace("\r", ""))
	# Flush the trailing paragraph: a file that does not end with a blank line
	# would otherwise lose its last block of text.
	if parts:
		paragraphs.append("".join(parts))
	return [paragraph.strip() for paragraph in paragraphs]


def _build_toc( book ) :
	"""Return { chapter index text : chapter title text } found in the book text.

	Headings are either "CHAPTER <n>" style or bare roman numerals; whichever
	style occurs more often is used and the other is ignored. A repeated index
	starts a new "book" (part). When more than one part is found and the first
	two differ, keys are prefixed with "(Book <k>) "; otherwise only the first
	part is reported (the second is taken to be a repeat of the first, e.g. a
	contents listing followed by the real chapters).
	"""
	chapters = _CHAPTER_RE.findall(book)

	# Majority vote between the two heading styles.
	regular = sum(1 for chapter in chapters if chapter[1])
	numerals = len(chapters) - regular
	is_numerals = regular < numerals

	books = [[]]
	seen = set()
	for _, number, numeral, title in chapters:
		if number and not is_numerals:
			index = number.strip()
		elif numeral and is_numerals:
			index = numeral.strip()
		else:
			continue
		if index in seen:
			# Repeated index: a new part begins. The triggering index belongs to
			# the new part, so it must be remembered for the next boundary check.
			books.append([])
			seen = {index}
		else:
			seen.add(index)
		books[-1].append((index, title.strip()))

	toc = {}
	if len(books) > 1 and books[0] != books[1]:
		for book_number, entries in enumerate(books, start=1):
			for index, title in entries:
				toc[f"(Book {book_number}) {index}"] = title
	else:
		for index, title in books[0]:
			toc[index] = title
	return toc


def exec_regex_toc( file_book = None ) :
	"""Build a table of contents for a plain-text book and write it to toc.json.

	file_book -- path of a UTF-8 text file holding the whole book.

	Side effects: writes the normalised text (one paragraph per line) to
	out.txt and the table of contents to toc.json, both in the current
	working directory. Returns None.
	"""

	# CHANGE BELOW CODE TO USE REGEX TO BUILD A TABLE OF CONTENTS FOR A BOOK (task 1)

	# Input >> www.gutenberg.org sourced plain text file for a whole book
	# Output >> toc.json = { <chapter_number_text> : <chapter_title_text> }
	with open(file_book, "r", encoding="utf8") as readHandle:
		paragraphs = _split_paragraphs(readHandle)
	book = "\n".join(paragraphs)

	# Dump of the normalised text, kept from the original implementation.
	with open("out.txt", "w", encoding="utf8") as f:
		f.write(book)

	dictTOC = _build_toc(book)

	# DO NOT CHANGE THE BELOW CODE WHICH WILL SERIALIZE THE ANSWERS FOR THE AUTOMATED TEST HARNESS TO LOAD AND MARK

	writeHandle = codecs.open( 'toc.json', 'w', 'utf-8', errors = 'replace' )
	strJSON = json.dumps( dictTOC, indent=2 )
	writeHandle.write( strJSON + '\n' )
	writeHandle.close()

# Script entry point: takes three command line arguments (ontonotes file, book file,
# chapter file) and builds the table of contents from the book file.
if __name__ == '__main__':
	# sys.argv[0] is the script name, so three arguments mean at least four entries
	if len(sys.argv) < 4 :
		raise Exception( 'missing command line args : ' + repr(sys.argv) )
	ontonotes_file = sys.argv[1]
	book_file = sys.argv[2]
	chapter_file = sys.argv[3]

	# ontonotes_file and chapter_file are only logged here; only book_file is passed on
	logger.info( 'ontonotes = ' + repr(ontonotes_file) )
	logger.info( 'book = ' + repr(book_file) )
	logger.info( 'chapter = ' + repr(chapter_file) )

	# DO NOT CHANGE THE CODE IN THIS FUNCTION

	exec_regex_toc( book_file )