import re
from io import StringIO
import tokenize


def remove_comments_and_docstrings(source, lang):
    """
    Returns 'source' minus comments and docstrings.

    Python sources go through the tokenize module, Ruby sources are returned
    unchanged, and every other language is handled by a C-style regex that
    blanks out // and /* */ comments.
    """
    if lang in ['python']:
        io_obj = StringIO(source)
        out = ""
        prev_toktype = tokenize.INDENT
        last_lineno = -1
        last_col = 0
        for tok in tokenize.generate_tokens(io_obj.readline):
            token_type = tok[0]
            token_string = tok[1]
            start_line, start_col = tok[2]
            end_line, end_col = tok[3]
            ltext = tok[4]  # physical line text (unused)
            if start_line > last_lineno:
                last_col = 0
            if start_col > last_col:
                out += (" " * (start_col - last_col))
            if token_type == tokenize.COMMENT:
                # Drop comments entirely.
                pass
            elif token_type == tokenize.STRING:
                # A string that follows an INDENT or NEWLINE token, or that
                # starts at column 0, is treated as a docstring and dropped.
                if prev_toktype != tokenize.INDENT:
                    if prev_toktype != tokenize.NEWLINE:
                        if start_col > 0:
                            out += token_string
            else:
                out += token_string
            prev_toktype = token_type
            last_col = end_col
            last_lineno = end_line
        temp = []
        for x in out.split('\n'):
            if x.strip() != "":
                temp.append(x)
        return '\n'.join(temp)
    elif lang in ['ruby']:
        # Ruby comments are left untouched.
        return source
    else:
        # C-style languages: blank out // and /* */ comments with a regex
        # while leaving string and character literals intact.
        def replacer(match):
            s = match.group(0)
            if s.startswith('/'):
                return " "  # a space, not an empty string
            else:
                return s

        pattern = re.compile(
            r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
            re.DOTALL | re.MULTILINE
        )
        temp = []
        for x in re.sub(pattern, replacer, source).split('\n'):
            if x.strip() != "":
                temp.append(x)
        return '\n'.join(temp)
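
# A minimal usage sketch (illustrative, not part of the original module).
# The snippet below is a made-up input; any lang other than 'python' or 'ruby'
# is handled by the C-style regex branch:
#
#     src = "int f() { // add one\n    return x + 1;\n}"
#     remove_comments_and_docstrings(src, 'java')
#     # the '// add one' comment is blanked out and empty lines are dropped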
|
|
def tree_to_token_index(root_node):
    """Collect (start_point, end_point) spans for every leaf token under 'root_node'.

    String/character literals count as single tokens and comments are skipped;
    'root_node' is expected to look like a tree-sitter node (.children, .type,
    .start_point, .end_point).
    """
    if (len(root_node.children) == 0 or root_node.type in ['string_literal', 'string',
                                                           'character_literal']) and root_node.type != 'comment':
        return [(root_node.start_point, root_node.end_point)]
    else:
        code_tokens = []
        for child in root_node.children:
            code_tokens += tree_to_token_index(child)
        return code_tokens
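
# Sketch of how tree_to_token_index is typically driven (illustrative only;
# assumes a tree-sitter Parser has already been set up for the language):
#
#     tree = parser.parse(bytes(source, 'utf8'))
#     token_spans = tree_to_token_index(tree.root_node)
#     # token_spans: list of ((start_row, start_col), (end_row, end_col)) pairs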
|
|
def tree_to_variable_index(root_node, index_to_code):
    """Collect spans of variable-like leaf tokens under 'root_node'.

    'index_to_code' maps a (start_point, end_point) span to (position, token string);
    a leaf is kept only when its text differs from its node type, which filters
    out keywords and punctuation while keeping identifiers and literals.
    """
    if (len(root_node.children) == 0 or root_node.type in ['string_literal', 'string',
                                                           'character_literal']) and root_node.type != 'comment':
        index = (root_node.start_point, root_node.end_point)
        _, code = index_to_code[index]
        if root_node.type != code:
            return [(root_node.start_point, root_node.end_point)]
        else:
            return []
    else:
        code_tokens = []
        for child in root_node.children:
            code_tokens += tree_to_variable_index(child, index_to_code)
        return code_tokens
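
# Sketch of the index_to_code mapping expected above (illustrative only): each
# token span maps to (token position, token string), which can be built with
# the other helpers in this file:
#
#     token_spans = tree_to_token_index(tree.root_node)
#     lines = source.split('\n')
#     index_to_code = {span: (i, index_to_code_token(span, lines))
#                      for i, span in enumerate(token_spans)}
#     variable_spans = tree_to_variable_index(tree.root_node, index_to_code)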
|
|
def index_to_code_token(index, code):
    """Return the source text for a (start_point, end_point) span, given 'code' as a list of lines."""
    start_point = index[0]
    end_point = index[1]
    if start_point[0] == end_point[0]:
        # Token sits on a single line.
        s = code[start_point[0]][start_point[1]:end_point[1]]
    else:
        # Token spans several lines: tail of the first line, the middle lines,
        # and the head of the last line.
        s = ""
        s += code[start_point[0]][start_point[1]:]
        for i in range(start_point[0] + 1, end_point[0]):
            s += code[i]
        s += code[end_point[0]][:end_point[1]]
    return s
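
# Worked example (illustrative): with code = ['x = 1'] the span ((0, 0), (0, 1))
# yields 'x' and ((0, 4), (0, 5)) yields '1'; a span that crosses lines
# concatenates the tail of the first line, the middle lines, and the head of
# the last line.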
|
|