# ValueNet4SPARQL / src / named_entity_recognition / api_ner / extract_values.py
import argparse
import json
import os

from named_entity_recognition.api_ner.google_api_repository import remote_named_entity_recognition

def extract_entities(documents, recognize):
    """Run named-entity recognition over the question of every document.

    Args:
        documents: Iterable of dicts, each providing a 'question' entry.
        recognize: Callable taking a question and returning a mapping with
            'entities' and 'language' entries. Any falsy return value is
            counted as a failed request.

    Returns:
        A ``(ner_data, error_count)`` tuple. ``ner_data`` holds one dict with
        'entities', 'language' and 'question' for each successful request, in
        input order; ``error_count`` is the number of failed requests.

    Raises:
        KeyError: If a document has no 'question' entry, or a truthy result
            lacks 'entities' or 'language'.
    """
    ner_data = []
    error_count = 0
    for doc in documents:
        question = doc['question']
        extracted_values = recognize(question)
        if not extracted_values:
            # Failed requests are only counted; they produce no output entry.
            error_count += 1
            continue
        ner_data.append({
            'entities': extracted_values['entities'],
            'language': extracted_values['language'],
            'question': question,
        })
    return ner_data, error_count


def main():
    """Read questions from a JSON file, extract their entities and save them.

    Command-line arguments:
        --data_path: JSON file containing a list of objects with a 'question'.
        --output_path: Destination JSON file for the extracted entities.
        --ner_api_secret: Secret passed on to the remote NER API.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--data_path', type=str, required=True)
    arg_parser.add_argument('--output_path', type=str, required=True)
    arg_parser.add_argument('--ner_api_secret', type=str, required=True)

    args = arg_parser.parse_args()

    with open(args.data_path, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)

    ner_data, error_count = extract_entities(
        data,
        lambda question: remote_named_entity_recognition(question, args.ner_api_secret),
    )

    with open(args.output_path, 'w', encoding='utf-8') as f:
        json.dump(ner_data, f, indent=2)

    # Report the number of successful extractions, not the number of input
    # documents, so the two figures add up to the size of the input.
    print("Extracted {} values. {} requests failed.".format(len(ner_data), error_count))


if __name__ == '__main__':
    main()