import re

from more_itertools import flatten

### ATTENTION ###
# The following heuristics are very basic and just a low-effort approach to a quite complex problem. The general
# idea is to add possible candidates for values in questions which do not get delivered by a standard NER-system.
# These values will then be fed into the database-value-finder, which will boil them down to the ones really
# found in the database.
# A more powerful idea would be to train a stochastic generative model with the task of generating value
# candidates based on some input.


def find_values_in_quote(question):
    """
    We try to find all values in quotes, as these are almost always values we need later.
    We support a variety of different quote utf-8 characters.
    """
    # Attention: the \s at the start of the regex is not a mistake, but necessary to avoid matching
    # apostrophes inside words (e.g. Jean d'Arc is 'french')
    matches = re.findall(r"\s[\"'‘“’](.+?)[\"'’”]", question)
    return matches


def find_ordinals(question_tokens):
    ordinals = {
        'once': 1,
        'twice': 2,
        'thrice': 3,
        'single': 1,
        'double': 2,
        'triple': 3,
        'first': 1,
        'second': 2,
        'third': 3,
        'fourth': 4,
        'fifth': 5,
        'sixth': 6,
        'seventh': 7,
        'eighth': 8,
        'ninth': 9,
        'tenth': 10,
    }

    values_from_ordinals = []

    tokens = _sub_tokenize_tokens(question_tokens)

    for token in tokens:
        if token in ordinals:
            values_from_ordinals.append(str(ordinals[token]))

    return values_from_ordinals


def find_emails(question):
    matches = re.findall(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", question)
    return matches


def find_genders(question_tokens):
    gender_abbreviations = {
        'female': ['F', 'female'],
        'females': ['F', 'female'],
        'girl': ['F', 'female'],
        'girls': ['F', 'female'],
        'male': ['M', 'male'],
        'males': ['M', 'male'],
        'boy': ['M', 'male'],
        'boys': ['M', 'male']
    }

    gender_values = []

    tokens = _sub_tokenize_tokens(question_tokens)

    for token in tokens:
        if token in gender_abbreviations:
            gender_values.extend(gender_abbreviations[token])

    return gender_values


def find_null_empty_values(question_tokens):
    null_empty_mentionings = {
        'NULL': 'null',
        'null': 'null',
        'empty': ''
    }

    null_empty_values = []

    tokens = _sub_tokenize_tokens(question_tokens)

    for token in tokens:
        if token in null_empty_mentionings:
            null_empty_values.append(null_empty_mentionings[token])

    return null_empty_values


def find_variety_of_common_mentionings(question_tokens):
    common_mentionings = {
        'spring': 'spring',
        'fall': 'fall',
        'summer': 'summer',
        'winter': 'winter',
        'morning': 'morning',
        'evening': 'evening',
        'night': 'night',
        'day': 'day',
        'yes': 'yes',
        'no': 'no'
    }

    common_values = []

    tokens = _sub_tokenize_tokens(question_tokens)

    for token in tokens:
        if token in common_mentionings:
            common_values.append(common_mentionings[token])

    return common_values


def find_special_codes(question):
    """
    Special codes refer to model numbers, classes, rooms, etc.
    Example: "What is the first name of the professor who is teaching CIS-220 and QM-261?"
    """
    matches = re.findall(r"[A-Z-/0-9]{2,}", question)
    # drop purely numeric matches - they get handled by other heuristics
    return [m for m in matches if not m.isnumeric()]


def find_single_letters(question):
    """
    If the word "letter" (or its plural, "letters") is mentioned, return all single letters.
    """
    if re.findall(r"\bletter\b|\bletters\b", question):
        matches = re.findall(r"\b[A-Za-z]\b", question)
        return matches

    return []
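

# A minimal, hand-checked demo of the pure string heuristics above (quotes, e-mails,
# special codes). The sample question is invented for illustration only and is not
# part of the project's data or tests.
def _demo_string_heuristics():
    question = "Show the id of students with the email 'bob@example.org' taking CIS-220"

    assert find_values_in_quote(question) == ['bob@example.org']
    assert find_emails(question) == ['bob@example.org']
    assert find_special_codes(question) == ['CIS-220']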
Simple example: "What are the names and addressed of customers who have both New and Pending orders?" --> 'New' and 'Pending' Consecutive example: "What is id of the staff who had a Staff Department Assignment earlier than any Clerical Staff?" --> 'Staff Department Assignment' and 'Clerical Staff' """ all_capitalized_words = [] # english sentences normally start with an upper case - but it could also be that the first word is already a special word. # We therefore check if the first letter is uppercase and the second one is lowercase. If that's the case, we simply throw away the first letter to avoid confusion. if question[0].isupper and question[1].islower(): question = question[1:] # with re.finditer() the group() property is referring to the full match. The next elements are the groups. Have a look at regex101.com to get the regex. consecutive_capitalized_words = [match.group() for match in re.finditer(r"(\b[A-Z0-9][A-Za-z0-9-/]+\b\s)+\b[A-Z0-9][A-Za-z0-9-/]+", question)] all_capitalized_words.extend(consecutive_capitalized_words) single_capitalized_word = [match.group() for match in re.finditer(r"\b[A-Z0-9][A-Za-z0-9-/]+\b", question)] for capitalized_word in single_capitalized_word: # make sure the capitalized word is not already part of consecutive_capitalized_words. if next(filter(lambda w: capitalized_word in w, consecutive_capitalized_words), None) is None: # don't add simple numbers - they get handled by other heuristics if not capitalized_word.isnumeric(): all_capitalized_words.append(capitalized_word) return all_capitalized_words def find_months(question_tokens): months = { 'january': '1/', 'february': '2/', 'march': '3/', 'april': '4/', 'may': '5/', 'june': '6/', 'july': '7/', 'august': '8/', 'september': '9/', 'october': '10/', 'november': '11/', 'december': '12/' } months_fuzzy_dates = [] tokens = _sub_tokenize_tokens(question_tokens) for token in tokens: if token in months: months_fuzzy_dates.append(months[token]) return months_fuzzy_dates def find_location_abbreviations(question_tokens, question): """ This heuristic is just a very basic approximation for a much complexer problem. Location names are very divers and require a powerful model to understand them properly. 
""" country_name_abbrevations_US = [ 'USA', 'US', 'United States', 'United States of America' ] country_name_abbrevations_UK = [ 'UK', 'United Kingdom', 'England' ] location_abbvreviations_US = { 'AK': ['Alaska'], 'AL': ['Alabama'], 'AR': ['Arkansas'], 'AZ': ['Arizona'], 'CA': ['California'], 'CO': ['Colorado'], 'CT': ['Connecticut'], 'DE': ['Delaware'], 'FL': ['Florida'], 'GA': ['Georgia'], 'HI': ['Hawaii'], 'IA': ['Iowa'], 'ID': ['Idaho'], 'IL': ['Illinois'], 'IN': ['Indiana'], 'KS': ['Kansas'], 'KY': ['Kentucky'], 'LA': ['Louisiana', 'Los Angeles'], 'MA': ['Massachusetts'], 'MD': ['Maryland'], 'ME': ['Maine'], 'MI': ['Michigan'], 'MN': ['Minnesota'], 'MO': ['Missouri'], 'MS': ['Mississippi'], 'MT': ['Montana'], 'NC': ['North Carolin'], 'ND': ['North Dakota'], 'NE': ['Nebraska'], 'NH': ['New Hampshire'], 'NJ': ['New Jersey'], 'NM': ['New Mexico'], 'NV': ['Nevada'], 'NY': ['New York'], 'OH': ['Ohio'], 'OK': ['Oklahoma'], 'OR': ['Oregon'], 'PA': ['Pennsylvania'], 'RI': ['Rhode Island'], 'SC': ['South Carolin'], 'SD': ['South Dakota'], 'TN': ['Tennessee'], 'TX': ['Texas'], 'UT': ['Utah'], 'VA': ['Virginia'], 'VT': ['Vermont'], 'WA': ['Washington'], 'WI': ['Wisconsin'], 'WV': ['West Virginia'], 'WY': ['Wyoming'] } location_candidates = [] for key, potential_values in location_abbvreviations_US.items(): add_me = False if key in question_tokens: add_me = True for sub_value in potential_values: if sub_value in question_tokens: add_me = True if add_me: location_candidates.append(key) location_candidates.extend(potential_values) for abbreviation in country_name_abbrevations_US: if abbreviation in question: # we don't know how to look for USA - therefore add all options. The database finder should sort them out. location_candidates.extend(country_name_abbrevations_US) for abbreviation in country_name_abbrevations_UK: if abbreviation in question: # we don't know how to look for United Kingdom - therefore add all options. The database finder should sort them out. location_candidates.extend(country_name_abbrevations_UK) return location_candidates def _sub_tokenize_tokens(tokens): """ There are some combined tokens we need to further tokenize (example: "fourth-grade") """ return flatten(map(lambda t: t.split('-'), tokens))