from unittest import TestCase

from named_entity_recognition.handcrafted_heuristics import (
    find_capitalized_words,
    find_emails,
    find_genders,
    find_location_abbreviations,
    find_months,
    find_null_empty_values,
    find_ordinals,
    find_single_letters,
    find_special_codes,
    find_values_in_quote,
    find_variety_of_common_mentionings,
)


class Test(TestCase):
    """Unit tests for the helpers in ``handcrafted_heuristics``.

    Every test follows the GIVEN / WHEN / THEN layout. Tests that check
    several samples are table-driven and wrap each sample in ``subTest`` so
    that one failing sample does not hide the results of the others.
    """

    def test__find_values_in_quota(self):
        """Single- and double-quoted values are both returned."""
        # GIVEN
        question = "Find the names of the customers who have order status both 'On Road' and \"Shipped\""

        # WHEN
        values = find_values_in_quote(question)

        # THEN
        self.assertEqual(['On Road', 'Shipped'], values)

    def test__find_values_in_quota_apostroph_in_names(self):
        """A possessive apostrophe (head's) is not treated as an opening quote."""
        # GIVEN
        question = "Which head's name has the substring 'Ha'? List the id and name."

        # WHEN
        values = find_values_in_quote(question)

        # THEN
        self.assertEqual(['Ha'], values)

    def test__find_values_in_quota__another_weird_apostroph(self):
        """Typographic quotes (’) delimit a value as well."""
        # GIVEN
        question = "display the employee number and name( first name and last name ) for all employees who work in a department with any employee whose name contains a ’T’."

        # WHEN
        values = find_values_in_quote(question)

        # THEN
        self.assertEqual(['T'], values)

    def test__find_ordinals(self):
        """An ordinal word token is returned as its number, as a string."""
        # GIVEN
        question = ['how', 'many', 'third', 'head', 'of', 'department', 'are', 'older', 'than', '56', '?']

        # WHEN
        ordinals = find_ordinals(question)

        # THEN
        self.assertEqual(['3'], ordinals)

    def test__find_ordinals_combined_tokens(self):
        """An ordinal inside a hyphenated token (fourth-grade) is found too."""
        # GIVEN
        question = ["Report", "the", "total", "number", "of", "students", "for", "each", "fourth-grade", "classroom", "."]

        # WHEN
        ordinals = find_ordinals(question)

        # THEN
        self.assertEqual(['4'], ordinals)

    def test__find_emails(self):
        """An e-mail address is extracted without the trailing question mark."""
        # GIVEN
        question = 'Find id of the candidate whose email is stanley.monahan@example.org?'

        # WHEN
        emails = find_emails(question)

        # THEN
        self.assertEqual(['stanley.monahan@example.org'], emails)

    def test__find_genders(self):
        """The token "female" yields both the short and the long gender value."""
        # GIVEN
        question = ["how", "many", "female", "people", "are", "older", "than", "30", "in", "our", "record", "?"]

        # WHEN
        gender_values = find_genders(question)

        # THEN
        self.assertEqual(['F', 'female'], gender_values)

    def test__find_genders_plural(self):
        """"female" used as a noun ("all female from canada") is detected."""
        # GIVEN
        question = [
            "show", "name", "for", "all", "female", "from", "canada", "having", "a", "wedding", "in", "year", "year",
            ".",
        ]

        # WHEN
        gender_values = find_genders(question)

        # THEN
        self.assertEqual(['F', 'female'], gender_values)

    def test__find_genders_male_plural(self):
        """"male" yields the male gender values."""
        # GIVEN
        question = [
            "find", "average", "height", "and", "weight", "for", "all", "male", "(", "sex", "is", "m", ")", ".",
        ]

        # WHEN
        gender_values = find_genders(question)

        # THEN
        self.assertEqual(['M', 'male'], gender_values)

    def test__find_genders_girl(self):
        """"girl" is mapped to the female gender values."""
        # GIVEN
        question = ["how", "many", "girl", "student", "who", "are", "younger", "than", "25", "?"]

        # WHEN
        gender_values = find_genders(question)

        # THEN
        self.assertEqual(['F', 'female'], gender_values)

    def test__find_null_empty_values(self):
        """The token "null" is reported as the value 'null'."""
        # GIVEN
        question = [
            "find", "name", "of", "all", "reviewer", "who", "have", "rating", "with", "a", "null", "value", "for",
            "date", ".",
        ]

        # WHEN
        null_empty_values = find_null_empty_values(question)

        # THEN
        self.assertEqual(['null'], null_empty_values)

    def test__find_null_empty_values_empty_value(self):
        """The token "empty" is reported as the empty string."""
        # GIVEN
        question = [
            "what", "are", "first", "name", "and", "last", "name", "of", "player", "whose", "death", "record", "is",
            "empty", "?",
        ]

        # WHEN
        null_empty_values = find_null_empty_values(question)

        # THEN
        self.assertEqual([''], null_empty_values)

    def test__find_variaty_of_common_mentionings(self):
        """Seasons, times of day and yes/no are all picked up, in token order."""
        # GIVEN
        # in opposite to the other tests not a real sample - just to make sure we get all values.
        question = [
            "what", "are", "first", "spring", "and", "fall", "summer", "winter", "player", "whose", "morning",
            "evening", "is", "night", "day", "?", "yes", "no",
        ]

        # WHEN
        common_mentionings = find_variety_of_common_mentionings(question)

        # THEN
        self.assertEqual(
            ['spring', 'fall', 'summer', 'winter', 'morning', 'evening', 'night', 'day', 'yes', 'no'],
            common_mentionings,
        )

    def test__find_special_codes(self):
        """Code-like tokens (letters with digits/hyphens, all-caps) are found."""
        # GIVEN
        cases = [
            ("What is the first name of the professor who is teaching CIS-220 and QM-261?", ['CIS-220', 'QM-261']),
            ("What are the codes of all the courses that are located in room KLR209?", ['KLR209']),
            ("How many departments are in the division AS?", ['AS']),
        ]

        for question, expected in cases:
            with self.subTest(question=question):
                # WHEN
                special_codes = find_special_codes(question)

                # THEN
                self.assertEqual(expected, special_codes)

    def test__find_special_codes__avoid_simple_numbers(self):
        """A plain decimal number is not reported as a special code."""
        # GIVEN
        question1 = "What are the names of stations that have latitude lower than 37.5?"

        # WHEN
        special_codes1 = find_special_codes(question1)

        # THEN
        self.assertEqual([], special_codes1)

    def test__find_single_letters(self):
        """Single letters announced by "letter"/"letters" are extracted."""
        # GIVEN
        cases = [
            ("when is the hire date for those employees whose first name does not containing the letter M?", ['M']),
            (
                "Display all the information for all employees who have the letters D or S in their first name and also arrange the result in descending order by salary.",
                ['D', 'S'],
            ),
            (
                "What are the descriptions of the categories that products with product descriptions that contain the letter t are in?",
                ['t'],
            ),
            ("A sentence without the special words has no matches.", []),
        ]

        for question, expected in cases:
            with self.subTest(question=question):
                # WHEN
                single_letters = find_single_letters(question)

                # THEN
                self.assertEqual(expected, single_letters)

    def test__find_capitalized_words(self):
        """Capitalized words and multi-word capitalized phrases are extracted."""
        # GIVEN
        cases = [
            (
                "For grants that have descriptions of Regular and Initial Applications, what are their start dates?",
                ['Initial Applications', 'Regular'],
            ),
            (
                "Find the department name and room of the course INTRODUCTION TO COMPUTER SCIENCE.",
                ['INTRODUCTION TO COMPUTER SCIENCE'],
            ),
            (
                "What are the names and addressed of customers who have both New and Pending orders?",
                ['New', 'Pending'],
            ),
            (
                "Show the shipping charge and customer id for customer orders with order status Cancelled or Paid.",
                ['Cancelled', 'Paid'],
            ),
            (
                "Find the names of departments that are either in division AS or in division EN and in Building NEB.",
                ['Building NEB', 'AS', 'EN'],
            ),
            (
                "Return the distinct name of customers whose order status is Pending, in the order of customer id.",
                ['Pending'],
            ),
            (
                "What is id of the staff who had a Staff Department Assignment earlier than any Clerical Staff?",
                ['Staff Department Assignment', 'Clerical Staff'],
            ),
            (
                "List the order id, customer id for orders in Cancelled status, ordered by their order dates.",
                ['Cancelled'],
            ),
            (
                "What is the student capacity and type of gender for the dorm whose name as the phrase Donor in it?",
                ['Donor'],
            ),
            ("Do you have a friend living in New York City?", ['New York City']),
            ("Show all flight numbers with aircraft Airbus A340-300.", ['Airbus A340-300']),
            (
                "Show names for all employees who have certificates on both Boeing 737-800 and Airbus A340-300",
                ['Boeing 737-800', 'Airbus A340-300'],
            ),
            # a plain number is not a capitalized word!
            ("What are the names of stations that have latitude lower than 37.5?", []),
            ("Have Peter Mertens and Dina Barbian written a paper together ?", ['Peter Mertens', 'Dina Barbian']),
            ("how many papers does David M. Blei have at AISTATS", ['David M. Blei', 'AISTATS']),
        ]

        for question, expected in cases:
            with self.subTest(question=question):
                # WHEN
                capitalized_words = find_capitalized_words(question)

                # THEN
                self.assertEqual(expected, capitalized_words)

    def test__find_location_abbreviations(self):
        """Known locations are expanded to their abbreviations and aliases."""
        # GIVEN
        usa = ['USA', 'US', 'United States', 'United States of America']
        uk = ['UK', 'United Kingdom', 'England']
        louisiana = ['LA', 'Louisiana', 'Los Angeles']

        cases = [
            (
                ['What', 'are', 'the', 'departure', 'and', 'arrival', 'dates', 'of', 'all', 'flights', 'from', 'LA',
                 'to', 'Honolulu', '?'],
                louisiana,
            ),
            (
                ['List', 'the', 'number', 'of', 'invoices', 'from', 'the', 'US', ',', 'grouped', 'by', 'state', '.'],
                usa,
            ),
            (
                ['List', 'the', 'number', 'of', 'invoices', 'and', 'the', 'invoice', 'total', 'from', 'California',
                 '.'],
                ['CA', 'California'],
            ),
            (
                ['What', 'are', 'the', 'companies', 'and', 'main', 'industries', 'of', 'all', 'companies', 'that',
                 'are', 'not', 'headquartered', 'in', 'the', 'United', 'States', '?'],
                usa,
            ),
            (
                ['What', 'are', 'the', 'names', 'of', 'the', 'ships', 'that', 'are', 'from', 'either', 'the', 'US',
                 'or', 'the', 'UK', '?'],
                usa + uk,
            ),
            (
                ['What', 'is', 'average', 'number', 'of', 'students', 'enrolled', 'in', 'Florida', 'colleges', '?'],
                ['FL', 'Florida'],
            ),
            (
                ['What', 'are', 'the', 'names', 'and', 'enrollment', 'numbers', 'for', 'colleges', 'that', 'have',
                 'more', 'than', '10000', 'enrolled', 'and', 'are', 'located', 'in', 'Louisiana', '?'],
                louisiana,
            ),
            (
                ['How', 'many', 'are', 'the', 'projects', 'started', 'in', 'Caserta', 'in', 'the', 'QUANTUM',
                 'ENGINEERING', 'DEPARTMENT', 'started', 'after', 'the', '2015', '?'],
                [],
            ),
        ]

        for tokens, expected in cases:
            # The heuristic takes the token list and the question as one string.
            sentence = ' '.join(tokens)
            with self.subTest(question=sentence):
                # WHEN
                location_abbreviations = find_location_abbreviations(tokens, sentence)

                # THEN
                self.assertEqual(expected, location_abbreviations)

    def test__find_months(self):
        """A month name is returned as a fuzzy date prefix ('8/' for august)."""
        # GIVEN
        question = [
            "for", "each", "zip", "code", ",", "return", "average", "mean", "temperature", "of", "august", "there",
            ".",
        ]

        # WHEN
        months_fuzzy = find_months(question)

        # THEN
        self.assertEqual(['8/'], months_fuzzy)