89 lines
2.5 KiB
Python
89 lines
2.5 KiB
Python
import sys
|
|
import pickle
|
|
import re
|
|
from sklearn.cluster import KMeans
|
|
from sklearn.pipeline import Pipeline
|
|
from sklearn.preprocessing import StandardScaler
|
|
import pandas as pd
|
|
|
|
|
|
# remove comments
|
|
# this function was taken from:
|
|
# https://stackoverflow.com/questions/241327/remove-c-and-c-comments-using-python
|
|
def comment_remover(text):
    """Return ``text`` with C/C++ comments replaced by single spaces.

    Both ``//`` line comments and ``/* ... */`` block comments are removed.
    Quoted string and character literals are matched by the same pattern
    purely so that comment-like sequences inside them are copied through
    unchanged.
    """
    token_re = re.compile(
        r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
        re.DOTALL | re.MULTILINE
    )

    def _substitute(found):
        token = found.group(0)
        # Only the comment alternatives start with '/'; literals start with a
        # quote. A space (not an empty string) keeps the neighbouring tokens
        # from being glued together.
        return " " if token.startswith('/') else token

    return token_re.sub(_substitute, text)
|
|
|
|
def preproc_remover(text):
    """Return ``text`` without C preprocessor directives and blank lines.

    A line is dropped when its first non-whitespace character is ``#`` or
    when it continues a dropped line that ended with a backslash. Every
    kept line is emitted unchanged, followed by a single newline.

    Args:
        text: C source code; comments are expected to be stripped already.

    Returns:
        The remaining lines joined into one string ('' if nothing is kept).
    """
    kept = []
    in_directive = False  # True while inside a backslash-continued directive
    for line in text.split('\n'):
        stripped = line.strip()
        if not stripped:
            # A blank line ends a pending continuation: the spliced logical
            # line stops here, so the following line is ordinary code again.
            in_directive = False
            continue
        if in_directive or stripped[0] == '#':
            # Look at the stripped line so a trailing '\r' (CRLF input) or
            # stray spaces after the backslash do not hide the continuation.
            in_directive = stripped.endswith('\\')
            continue
        kept.append(line + "\n")
    return "".join(kept)
|
|
|
|
# Cluster label whose members main() prints. The original inline comment calls
# these "function definitions"; the meaning of the label is fixed by the
# pickled model, not by this file -- TODO confirm against the model in use.
_FUNCTION_DEFINITION_CLUSTER = 3

# Characters that terminate a chunk of C code. The capture group makes
# re.split() return the delimiters too, so they can be re-attached.
_DELIMITER_RE = re.compile(r"([{};:])")


def _split_c_expressions(src):
    """Split ``src`` into chunks, each ending with the delimiter that closed it."""
    chunks = []
    for piece in _DELIMITER_RE.split(src):
        if piece in ('{', '}', ';', ':'):
            # re.split() always yields a (possibly empty) text piece before
            # the first delimiter, so chunks is never empty here.
            chunks[-1] += piece
        else:
            chunks.append(piece)
    return chunks


def _build_features(c_expr):
    """Build the model input frame, one row per chunk in ``c_expr``."""
    rows = []
    for expr in c_expr:
        if '(' in expr:
            # Counts only the spaces before the first opening parenthesis.
            spaces = expr.split('(')[0].strip().count(' ')
        else:
            spaces = 0
        rows.append({
            '{': expr.count('{'),
            'space_before_(': spaces,
            'sb(>0': 0 if spaces == 0 else 1,
        })
    # Column names and order match what the original code passed to predict().
    return pd.DataFrame(rows, columns=['{', 'space_before_(', 'sb(>0'])


def main():
    """Print the chunks of a C source file that the model assigns to cluster 3.

    Command line:
        argv[1]: path of the C source file to analyse.
        argv[2]: prefix that is concatenated with 'cluster.pkl' to locate the
            pickled model (a directory prefix therefore needs its trailing
            path separator).

    The source is stripped of comments and preprocessor lines, flattened to
    one line, split at '{', '}', ';' and ':', and each chunk is classified by
    the unpickled object's ``predict`` method.

    Raises:
        SystemExit: if fewer than two command-line arguments are given.
    """
    if len(sys.argv) < 3:
        sys.exit("usage: {} <c-source-file> <model-path-prefix>".format(sys.argv[0]))

    with open(sys.argv[1], 'r') as source_file:
        src = source_file.read()

    src = comment_remover(src)
    src = preproc_remover(src)
    src = src.replace('\n', ' ')

    c_expr = _split_c_expressions(src)
    features = _build_features(c_expr)

    # SECURITY: pickle.load() can execute arbitrary code contained in the
    # file. Only point argv[2] at model files from a trusted source.
    with open(sys.argv[2] + 'cluster.pkl', "rb") as model_file:
        pipe = pickle.load(model_file)

    # print function definitions
    for expr, cluster in zip(c_expr, pipe.predict(features)):
        if cluster == _FUNCTION_DEFINITION_CLUSTER:
            print(expr)
|
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: run the extraction only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|