cHelper/main.py

89 lines
2.5 KiB
Python

import sys
import pickle
import re
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import pandas as pd
# remove comments
# this function was taken from:
# https://stackoverflow.com/questions/241327/remove-c-and-c-comments-using-python

# Alternatives, in order: a `//` line comment, a `/* ... */` block comment,
# a single-quoted character literal, a double-quoted string literal.
# Literals are matched only so that comment markers appearing inside them are
# consumed as part of the literal and therefore left untouched.
_COMMENT_OR_LITERAL_RE = re.compile(
    r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
    re.DOTALL | re.MULTILINE,
)


def comment_remover(text):
    """Return *text* with C/C++ comments replaced by a single space.

    Both `//` line comments and `/* ... */` block comments (including
    multi-line ones) are replaced. String and character literals are copied
    through unchanged, so comment markers inside them are not treated as
    comments.

    Args:
        text: C/C++ source code as a string.

    Returns:
        The source with every comment replaced by one space character.
    """
    def replacer(match):
        s = match.group(0)
        if s.startswith('/'):
            # A comment: substitute a space rather than an empty string so
            # that the tokens on either side of the comment stay separated.
            return " "
        # A string or character literal: keep it verbatim.
        return s

    return _COMMENT_OR_LITERAL_RE.sub(replacer, text)
def preproc_remover(text):
    """Return *text* without preprocessor directives and without blank lines.

    A line whose first non-whitespace character is `#` is dropped. If a
    dropped line ends with a backslash, the following line is treated as its
    continuation and dropped too, and so on for as long as each dropped line
    ends with a backslash. Empty and whitespace-only lines are dropped as
    well.

    Args:
        text: C/C++ source code as a string.

    Returns:
        The kept lines, each terminated by a newline character.
    """
    in_continuation = False
    kept = []
    for line in text.split('\n'):
        stripped = line.lstrip()
        if not stripped:
            # A blank line ends any backslash continuation. Without this
            # reset, the first code line after "#define X \" followed by an
            # empty line would be swallowed as if it belonged to the macro.
            in_continuation = False
            continue
        if not in_continuation and not stripped.startswith('#'):
            kept.append(line)
            continue
        # This line is a directive or the continuation of one: drop it and
        # remember whether it continues onto the next line.
        in_continuation = line.endswith('\\')
    return "".join(line + "\n" for line in kept)
def _split_c_expressions(src):
    """Split *src* into chunks that each end with the `{`, `}`, `;` or `:` following them."""
    chunks = []
    # The capturing group makes re.split return the delimiters too, so the
    # result alternates text, delimiter, text, ... and always begins with a
    # (possibly empty) text piece; chunks[-1] therefore always exists when a
    # delimiter is seen.
    for piece in re.split(r"([\{\};:])", src):
        if piece in ('{', '}', ';', ':'):
            chunks[-1] += piece
        else:
            chunks.append(piece)
    return chunks


def _expression_features(c_expr):
    """Build a DataFrame with one row of counts per expression in *c_expr*."""
    rows = []
    for expr in c_expr:
        paren_count = expr.count('(')
        if paren_count:
            # counts only the spaces before the first opening parenthesis
            spaces_before_paren = expr.split("(")[0].strip().count(' ')
        else:
            # No parenthesis at all: the feature is defined as 0.
            spaces_before_paren = 0
        rows.append({
            '{': expr.count('{'),
            '(': paren_count,
            'space_before_(': spaces_before_paren,
            'c_expr': expr,
        })
    # Built in one go rather than with pd.concat inside the loop, which
    # copies the whole frame on every iteration.
    return pd.DataFrame(rows, columns=['{', '(', 'space_before_(', 'c_expr'])


def main():
    """Print the expressions of a C source file that the model puts in cluster 3.

    Command line:
        argv[1]: path of the C source file to analyse.
        argv[2]: prefix to which 'cluster.pkl' is appended verbatim to form
            the path of the pickled model (so a directory needs its trailing
            separator).

    The source is stripped of comments and preprocessor lines, flattened to
    one line, split into expressions, and each expression is turned into
    the feature columns '{', 'space_before_(' and 'sb(>0'. The unpickled
    object's ``predict`` is called on those features; expressions labelled
    3 are printed, which this script treats as function definitions.
    """
    if len(sys.argv) < 3:
        sys.exit("usage: %s <c_source_file> <model_path_prefix>" % sys.argv[0])

    with open(sys.argv[1], 'r') as fd:
        src = fd.read()

    src = comment_remover(src)
    src = preproc_remover(src)
    src = src.replace('\n', ' ')

    # create a dataframe of the c_expr
    c_expr_df = _expression_features(_split_c_expressions(src))

    # Feature matrix for the model; the column order is kept as
    # '{', 'space_before_(', 'sb(>0'.
    data = c_expr_df.drop(['c_expr', '('], axis=1)
    data['sb(>0'] = (data['space_before_('] != 0).astype(int)

    # SECURITY: pickle.load can execute arbitrary code from the file. Only
    # point this at model files you created or otherwise trust.
    with open(sys.argv[2] + 'cluster.pkl', "rb") as pickle_fd:
        # NOTE(review): presumably a fitted scikit-learn Pipeline, given the
        # file's imports; only its predict() method is used here.
        pipe = pickle.load(pickle_fd)

    c_expr_df['cluster'] = pipe.predict(data)

    # print function definitions
    for expr in c_expr_df.loc[c_expr_df['cluster'] == 3, 'c_expr']:
        print(expr)
# Run the tool only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()