# -*- coding: utf-8 -*-
"""
Created on Mon Dec  7 19:35:09 2020

@author: DrLC

Tokenize Java functions with tree-sitter and normalize their literals.

Run as a script, this reads a JSON-lines file whose records carry ``func``
(Java source text) and ``idx`` fields, tokenizes and normalizes every
function, and writes the result to a new JSON-lines file.
"""

import json

# Spellings accepted by ``float()`` that are not numeric literals in source
# code; an identifier with one of these names must be left untouched.
_NON_NUMERIC_FLOATS = frozenset({"inf", "infinity", "nan"})


def is_fp(num):
    """Return True if the token *num* parses as a finite-looking float.

    Args:
        num: A single token as a string.

    Returns:
        True when ``float(num)`` succeeds and the token is not one of the
        special spellings ``inf`` / ``infinity`` / ``nan`` (any case, with an
        optional sign); False otherwise.
    """
    # float() happily parses "Infinity" or "-nan"; reject those up front so
    # identifiers with such names are not mistaken for literals.
    if num.lower().lstrip("+-") in _NON_NUMERIC_FLOATS:
        return False
    try:
        float(num)
    except ValueError:
        return False
    return True


def normalize(seq):
    """Replace literal tokens in *seq* with an empty-string placeholder.

    A token is treated as a literal when it contains a single or double
    quote, consists only of digits, starts with a hexadecimal prefix
    (``0x`` / ``0X``), or is accepted by :func:`is_fp`. Every other token is
    copied through unchanged.

    Args:
        seq: An iterable of token strings.

    Returns:
        A new list with the same length as *seq*.
    """
    # NOTE(review): every literal class maps to "" here. These look like they
    # may originally have been distinct tags (angle-bracket placeholders lost
    # to markup stripping) -- confirm against the upstream source before
    # relying on the empty placeholders.
    norm = []
    for t in seq:
        if "'" in t:
            norm.append("")
        elif '"' in t:
            norm.append("")
        elif t.isdigit() or t[:2].lower() == "0x":
            norm.append("")
        elif is_fp(t):
            norm.append("")
        else:
            norm.append(t)
    return norm


def tokenize_java(src, so_path='../data/java-language.so'):
    """Split Java source text into its leaf tokens, skipping comments.

    The tree-sitter parser is created on the first call and cached as the
    ``parser`` attribute of this function, so *so_path* is only consulted the
    first time the function runs.

    Args:
        src: Java source code as a string.
        so_path: Path of the compiled tree-sitter language library that
            provides the ``java`` grammar.

    Returns:
        A list of token strings in source order.
    """
    parser = getattr(tokenize_java, 'parser', None)
    if parser is None:
        # Imported here so the pure-Python helpers in this module stay usable
        # when the tree-sitter binding is not installed.
        from tree_sitter import Language, Parser

        parser = Parser()
        parser.set_language(Language(so_path, 'java'))
        # Cache only after the parser is fully configured.
        tokenize_java.parser = parser

    byte_seq = bytes(src, encoding='utf-8')
    tree = parser.parse(byte_seq)

    # Pre-order walk with an explicit stack: same token order as a recursive
    # traversal, but immune to Python's recursion limit on deep trees.
    tokens = []
    stack = [tree.root_node]
    while stack:
        node = stack.pop()
        children = node.children
        if children:
            # Push in reverse so the left-most child is visited first.
            stack.extend(reversed(children))
        elif node.type != 'comment':
            # Node offsets are byte offsets, hence slicing the encoded source.
            tokens.append(byte_seq[node.start_byte: node.end_byte])
    return [str(t, 'utf-8') for t in tokens]


def main():
    """Normalize every function in the input file and write the output file."""
    import tqdm

    data_path = "../data/bcb_data.jsonl"
    so_path = '../data/java-language.so'
    tgt_path = "../data/bcb_norm.jsonl"

    d = {}
    with open(data_path, "rb") as f:
        for l in tqdm.tqdm(f.readlines()):
            r = json.loads(l)
            toks = [t.strip() for t in tokenize_java(r['func'], so_path)]
            # A repeated idx overwrites the earlier record.
            d[r['idx']] = " ".join(normalize(toks))

    with open(tgt_path, "w", encoding="utf-8") as f:
        for idx, func in d.items():
            f.write(json.dumps({'func': func, 'idx': idx}) + '\n')


if __name__ == "__main__":
    main()