The cluster pipeline is now written to a pickle file (cluster.pkl) that main.py loads.

To print all function definitions of a C file, run:
python main.py <src.c>
This commit is contained in:
eisbaer 2022-10-01 14:42:32 +02:00
parent c4f6468cb4
commit ee3243f1e6
3 changed files with 102 additions and 9 deletions

View File

@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 110,
"execution_count": 2,
"id": "9bbbd1e6-64a8-4301-81ea-a911315f58f2",
"metadata": {
"collapsed": false
@ -13,7 +13,7 @@
"\n",
"fd = open(\"onion_crypto.c\", \"r\")\n",
"src = fd.read()\n",
"\n",
"fd.close()\n",
"\n",
"# remove comments\n",
"# this function was taken from:\n",
@ -35,10 +35,10 @@
"\n",
"\n",
"# remove preprocessor macros\n",
"t = False\n",
"def preproc_remover(text):\n",
" t = False\n",
" ret = \"\"\n",
" for l in src.split('\\n'):\n",
" for l in text.split('\\n'):\n",
" if l.lstrip() == '':\n",
" continue\n",
" if not '#' == l.lstrip()[0] and False == t:\n",
@ -63,7 +63,7 @@
},
{
"cell_type": "code",
"execution_count": 111,
"execution_count": 4,
"id": "b72627a9-2bd9-432f-aa97-3c4b1c035efa",
"metadata": {
"collapsed": false
@ -145,7 +145,7 @@
"0 0 2 0 memcpy(keys->my_identity, router_get_my_id_..."
]
},
"execution_count": 111,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@ -172,7 +172,7 @@
},
{
"cell_type": "code",
"execution_count": 116,
"execution_count": 5,
"id": "add32db0-81c5-4914-8050-2da4afa48340",
"metadata": {
"collapsed": false
@ -260,7 +260,7 @@
"0 3 int onion_skin_client_handshake(int type, ..."
]
},
"execution_count": 116,
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@ -269,6 +269,7 @@
"from sklearn.cluster import KMeans\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.preprocessing import StandardScaler\n",
"import pickle\n",
"\n",
"data = c_expr_df\n",
"data = data.drop(['c_expr'], axis=1)\n",
@ -281,8 +282,12 @@
" ])\n",
"\n",
"data['cluster'] = pipe.fit_predict(data)\n",
"data['c_expr'] = c_expr_df['c_expr'].apply(lambda x: x.lstrip())\n",
"\n",
"outFd = open('cluster.pkl', 'wb')\n",
"pickle.dump(pipe, outFd)\n",
"outFd.close()\n",
"\n",
"data['c_expr'] = c_expr_df['c_expr'].apply(lambda x: x.lstrip())\n",
"data.loc[data['cluster'] == 3, ['cluster', 'c_expr']]"
]
}

BIN
cluster.pkl Normal file

Binary file not shown.

88
main.py Normal file
View File

@ -0,0 +1,88 @@
import sys
import pickle
import re
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import pandas as pd
# remove comments
# this function was taken from:
# https://stackoverflow.com/questions/241327/remove-c-and-c-comments-using-python
def comment_remover(text):
    """Return *text* with C/C++ comments replaced by a single space each.

    Both ``// ...`` line comments and ``/* ... */`` block comments are
    handled. String and character literals are matched by the same pattern
    so that comment markers appearing inside them are left alone.
    """
    def _substitute(found):
        token = found.group(0)
        # Only comments begin with '/'; quoted literals are returned as-is.
        # A space (not an empty string) keeps neighbouring tokens separated.
        return " " if token.startswith('/') else token

    comment_or_literal = re.compile(
        r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
        re.DOTALL | re.MULTILINE
    )
    return comment_or_literal.sub(_substitute, text)
def preproc_remover(text):
    """Return *text* without preprocessor directives and blank lines.

    A line is dropped when its first non-whitespace character is ``#`` or
    when it continues a dropped line that ended with a backslash. Lines that
    are empty or whitespace-only are dropped as well. Every kept line is
    emitted unchanged, followed by a newline.

    Args:
        text: Source text, expected to have its comments removed already.

    Returns:
        The remaining lines joined into one string; ``""`` if nothing is kept.
    """
    kept = []
    in_continuation = False
    for line in text.split('\n'):
        stripped = line.lstrip()
        if not stripped:
            # A blank line terminates a backslash-continued directive, so the
            # line after it must be treated as ordinary code again.
            in_continuation = False
            continue
        if not in_continuation and stripped[0] != '#':
            kept.append(line)
            continue
        # This line belongs to a directive; a trailing backslash means the
        # directive carries on to the next line.
        in_continuation = line.endswith('\\')
    return "".join(line + "\n" for line in kept)
# Cluster label that the pipeline stored in cluster.pkl assigns to
# function definitions.
_FUNCTION_DEFINITION_CLUSTER = 3

# Characters that terminate one C "expression" when splitting the source.
_DELIMITERS = ('{', '}', ';', ':')


def _split_expressions(src):
    """Split *src* at ``{``, ``}``, ``;`` and ``:``, keeping each delimiter
    attached to the end of the piece that precedes it."""
    pieces = []
    # re.split with a capturing group yields text and delimiters alternately,
    # always starting with a (possibly empty) text piece.
    for token in re.split(r"([{};:])", src):
        if token in _DELIMITERS:
            pieces[-1] += token
        else:
            pieces.append(token)
    return pieces


def _expression_features(c_expr):
    """Build the per-expression feature frame with the columns
    ``{``, ``(``, ``space_before_(`` and ``c_expr``."""
    rows = []
    for expr in c_expr:
        paren_count = expr.count('(')
        # Count only the spaces before the first parenthesis; the feature is
        # 0 when the expression contains no parenthesis at all.
        if paren_count:
            spaces_before = expr.split('(')[0].strip().count(' ')
        else:
            spaces_before = 0
        rows.append({
            '{': expr.count('{'),
            '(': paren_count,
            'space_before_(': spaces_before,
            'c_expr': expr,
        })
    return pd.DataFrame(rows, columns=['{', '(', 'space_before_(', 'c_expr'])


def main():
    """Print every function definition found in the C file named by argv[1].

    The file is stripped of comments and preprocessor directives, split into
    expressions, and each expression is labelled by the pipeline loaded from
    ``cluster.pkl`` in the current working directory. Expressions labelled
    with cluster 3 are printed, one per line.

    Raises:
        SystemExit: if no source file path was given on the command line.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: python main.py <src.c>")

    with open(sys.argv[1], 'r') as fd:
        src = fd.read()

    src = comment_remover(src)
    src = preproc_remover(src)
    src = src.replace('\n', ' ')

    c_expr_df = _expression_features(_split_expressions(src))

    # Feature columns handed to the pipeline, in this order:
    # '{', 'space_before_(', 'sb(>0'.
    data = c_expr_df.drop(['c_expr', '('], axis=1)
    data['sb(>0'] = (data['space_before_('] != 0).astype(int)

    # NOTE(security): unpickling can execute arbitrary code; only load a
    # cluster.pkl that comes from a trusted source.
    with open('cluster.pkl', "rb") as pickle_fd:
        pipe = pickle.load(pickle_fd)

    c_expr_df['cluster'] = pipe.predict(data)

    # print function definitions
    is_definition = c_expr_df['cluster'] == _FUNCTION_DEFINITION_CLUSTER
    for expr in c_expr_df.loc[is_definition, 'c_expr']:
        print(expr)
# Script entry point: run the command-line tool only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()