The cluster pipeline is now written to a pickle file (cluster.pkl), and you can use it with main.py

to print all function definitions of a C file: python main.py <src.c>
2022-10-01 14:42:32 +02:00 · 2022-10-01 14:42:32 +02:00 · ee3243f1e6
parent c4f6468cb4
commit ee3243f1e6
3 changed files with 102 additions and 9 deletions
--- a/cluster.ipynb
+++ b/cluster.ipynb
@ -2,7 +2,7 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": 110,
+   "execution_count": 2,
   "id": "9bbbd1e6-64a8-4301-81ea-a911315f58f2",
   "metadata": {
    "collapsed": false
@ -13,7 +13,7 @@
    "\n",
    "fd = open(\"onion_crypto.c\", \"r\")\n",
    "src = fd.read()\n",
-    "\n",
+    "fd.close()\n",
    "\n",
    "# remove comments\n",
    "# this function was taken from:\n",
@ -35,10 +35,10 @@
    "\n",
    "\n",
    "# remove preprocessor macros\n",
-    "t = False\n",
    "def preproc_remover(text):\n",
+    "    t = False\n",
    "    ret = \"\"\n",
-    "    for l in src.split('\\n'):\n",
+    "    for l in text.split('\\n'):\n",
    "        if l.lstrip() == '':\n",
    "            continue\n",
    "        if not '#' == l.lstrip()[0] and False == t:\n",
@ -63,7 +63,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 111,
+   "execution_count": 4,
   "id": "b72627a9-2bd9-432f-aa97-3c4b1c035efa",
   "metadata": {
    "collapsed": false
@ -145,7 +145,7 @@
       "0  0  2               0     memcpy(keys->my_identity, router_get_my_id_..."
      ]
     },
-     "execution_count": 111,
+     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -172,7 +172,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 116,
+   "execution_count": 5,
   "id": "add32db0-81c5-4914-8050-2da4afa48340",
   "metadata": {
    "collapsed": false
@ -260,7 +260,7 @@
       "0        3  int onion_skin_client_handshake(int type,     ..."
      ]
     },
-     "execution_count": 116,
+     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -269,6 +269,7 @@
    "from sklearn.cluster import KMeans\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.preprocessing import StandardScaler\n",
+    "import pickle\n",
    "\n",
    "data = c_expr_df\n",
    "data = data.drop(['c_expr'], axis=1)\n",
@ -281,8 +282,12 @@
    "    ])\n",
    "\n",
    "data['cluster'] = pipe.fit_predict(data)\n",
-    "data['c_expr'] = c_expr_df['c_expr'].apply(lambda x: x.lstrip())\n",
    "\n",
+    "outFd = open('cluster.pkl', 'wb')\n",
+    "pickle.dump(pipe, outFd)\n",
+    "outFd.close()\n",
+    "\n",
+    "data['c_expr'] = c_expr_df['c_expr'].apply(lambda x: x.lstrip())\n",
    "data.loc[data['cluster'] == 3, ['cluster', 'c_expr']]"
   ]
  }
--- a/cluster.pkl
+++ b/cluster.pkl
--- a/main.py
+++ b/main.py
@ -0,0 +1,88 @@
+import sys
+import pickle
+import re
+from sklearn.cluster import KMeans
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler
+import pandas as pd
+
+
+# remove comments
+# this function was taken from:
+# https://stackoverflow.com/questions/241327/remove-c-and-c-comments-using-python
+def comment_remover(text):
+    """Return ``text`` with every C/C++ comment replaced by a single space.
+
+    The pattern also matches string and character literals so that comment
+    markers occurring inside them are left untouched.
+    """
+    comment_or_literal = re.compile(
+        r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
+        re.DOTALL | re.MULTILINE
+    )
+
+    def _substitute(found):
+        token = found.group(0)
+        # Only comments start with '/'; literals start with a quote and are
+        # handed back unchanged.  A space (not an empty string) is used so
+        # the tokens on either side of the comment stay separated.
+        return " " if token.startswith('/') else token
+
+    return comment_or_literal.sub(_substitute, text)
+
+def preproc_remover(text):
+    """Return ``text`` without preprocessor directives and blank lines.
+
+    A line is dropped when its first non-blank character is '#', or when it
+    continues a directive whose previous line ended with a backslash.
+    Blank (whitespace-only) lines are dropped as well.  Every kept line is
+    terminated with a newline in the result.
+    """
+    kept = []
+    in_continuation = False
+    for line in text.split('\n'):
+        stripped = line.lstrip()
+        if not stripped:
+            # A blank line ends a backslash continuation: the directive
+            # stops here, so the next non-blank line is ordinary code.
+            in_continuation = False
+            continue
+        if stripped[0] != '#' and not in_continuation:
+            kept.append(line + "\n")
+        else:
+            # Directive (or its continuation): remember whether it spills
+            # over onto the following line.
+            in_continuation = line[-1] == '\\'
+    return "".join(kept)
+
+def main():
+    """Print the expressions of a C file that fall into the function-definition cluster.
+
+    The C file named by ``sys.argv[1]`` is read, stripped of comments and
+    preprocessor lines, and split into expressions.  Simple count features
+    are computed per expression and passed to the pipeline loaded from
+    ``cluster.pkl``; every expression predicted as cluster 3 is printed.
+
+    Exits with a usage message when no source file is given.
+    """
+    if len(sys.argv) < 2:
+        sys.exit("usage: python main.py <src.c>")
+
+    with open(sys.argv[1], 'r') as fd:
+        src = fd.read()
+
+    src = comment_remover(src)
+    src = preproc_remover(src)
+    src = src.replace('\n', ' ')
+
+    # Split on '{', '}', ';' and ':' while keeping each delimiter attached
+    # to the text that precedes it.  re.split with a capturing group always
+    # yields a (possibly empty) non-delimiter piece first, so c_expr[-1]
+    # exists whenever a delimiter is seen.
+    c_expr = []
+    for s in re.split(r"([{};:])", src):
+        if s not in ('{', '}', ';', ':'):
+            c_expr.append(s)
+        else:
+            c_expr[-1] += s
+
+    # create a dataframe of the c_expr
+    rows = []
+    for e in c_expr:
+        paren_count = e.count('(')
+        rows.append({
+            '{': e.count('{'),
+            '(': paren_count,
+            # counts only the spaces before the first parenthesis;
+            # 0 when the expression contains no parenthesis at all
+            'space_before_(': (e.split("(")[0].strip().count(' ')
+                               if paren_count else 0),
+            'c_expr': e,
+        })
+    c_expr_df = pd.DataFrame(rows, columns=['{', '(', 'space_before_(', 'c_expr'])
+
+    # Feature matrix for the pipeline: same columns, in the same order, as
+    # the original code produced ('{', 'space_before_(', 'sb(>0').
+    data = c_expr_df[['{', 'space_before_(']].copy()
+    data['sb(>0'] = (data['space_before_('] != 0).astype(int)
+
+    # NOTE: pickle.load executes arbitrary code from the file; only use a
+    # cluster.pkl that comes from a trusted source.
+    with open('cluster.pkl', "rb") as pickleFd:
+        pipe = pickle.load(pickleFd)
+
+    c_expr_df['cluster'] = pipe.predict(data)
+    # print function definitions
+    # NOTE(review): cluster id 3 is assumed to be the function-definition
+    # cluster of the shipped cluster.pkl -- confirm after re-fitting.
+    for expr in c_expr_df.loc[c_expr_df['cluster'] == 3, 'c_expr']:
+        print(expr)
+
+
+
+
+
+# Run main() only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()