cHelper/cluster.ipynb

318 lines
9.8 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "9bbbd1e6-64a8-4301-81ea-a911315f58f2",
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import re\n",
"\n",
"# load the C source to analyse; the context manager closes the handle even if read() raises\n",
"with open(\"onion_crypto.c\", \"r\") as fd:\n",
"    src = fd.read()\n",
"\n",
"# remove comments\n",
"# this function was taken from:\n",
"# https://stackoverflow.com/questions/241327/remove-c-and-c-comments-using-python\n",
"def comment_remover(text):\n",
" def replacer(match):\n",
" s = match.group(0)\n",
" if s.startswith('/'):\n",
" return \" \" # note: a space and not an empty string\n",
" else:\n",
" return s\n",
" pattern = re.compile(\n",
" r'//.*?$|/\\*.*?\\*/|\\'(?:\\\\.|[^\\\\\\'])*\\'|\"(?:\\\\.|[^\\\\\"])*\"',\n",
" re.DOTALL | re.MULTILINE\n",
" )\n",
" return re.sub(pattern, replacer, text)\n",
"\n",
"src = comment_remover(src)\n",
"\n",
"\n",
"# remove preprocessor macros\n",
"def preproc_remover(text):\n",
" t = False\n",
" ret = \"\"\n",
" for l in text.split('\\n'):\n",
" if l.lstrip() == '':\n",
" continue\n",
" if not '#' == l.lstrip()[0] and False == t:\n",
" ret += l + \"\\n\"\n",
" else:\n",
" t = False\n",
" if '\\\\' == l[-1]:\n",
" t = True\n",
" return ret\n",
"\n",
"src = preproc_remover(src)\n",
"\n",
"\n",
"# flatten the source to one line, then cut it after every '{', '}', ';' or ':'\n",
"# so that each entry of c_expr is a text fragment ending in its delimiter\n",
"src = src.replace('\\n', ' ')\n",
"c_expr = []\n",
"# raw pattern with unescaped braces: inside a character class they need no\n",
"# backslash, and backslash-brace in a normal string is an invalid escape sequence\n",
"for s in re.split(r\"([{};:])\", src):\n",
"    if s in ('{', '}', ';', ':'):\n",
"        # the capture group makes re.split return the delimiter as its own item;\n",
"        # the list always starts with a text item, so c_expr[-1] exists here\n",
"        c_expr[-1] += s\n",
"    else:\n",
"        c_expr.append(s)\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "b72627a9-2bd9-432f-aa97-3c4b1c035efa",
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>{</th>\n",
" <th>(</th>\n",
" <th>space_before_(</th>\n",
" <th>c_expr</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>static const uint8_t NTOR3_CIRC_VERIFICATION[]...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>static const size_t NTOR3_CIRC_VERIFICATION_L...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>server_onion_keys_t * server_onion_keys_new(v...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>server_onion_keys_t *keys = tor_malloc_zero...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>memcpy(keys-&gt;my_identity, router_get_my_id_...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" { ( space_before_( c_expr\n",
"0 0 0 0 static const uint8_t NTOR3_CIRC_VERIFICATION[]...\n",
"0 0 0 0 static const size_t NTOR3_CIRC_VERIFICATION_L...\n",
"0 1 1 2 server_onion_keys_t * server_onion_keys_new(v...\n",
"0 0 2 3 server_onion_keys_t *keys = tor_malloc_zero...\n",
"0 0 2 0 memcpy(keys->my_identity, router_get_my_id_..."
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"# create a dataframe of the c_expr\n",
"# all rows are collected first and the frame is built once: concatenating inside\n",
"# the loop copies the frame on every iteration and leaves every row with index 0\n",
"c_expr_rows = [\n",
"    {\n",
"        '{': e.count('{'),\n",
"        '(': e.count('('),\n",
"        # counts only the spaces before the first opening parenthesis;\n",
"        # forced to 0 when the expression contains no parenthesis at all\n",
"        'space_before_(': e.split(\"(\")[0].strip().count(' ') if '(' in e else 0,\n",
"        'c_expr': e,\n",
"    }\n",
"    for e in c_expr\n",
"]\n",
"# explicit column list keeps the column order and yields the right columns for an empty input\n",
"c_expr_df = pd.DataFrame(c_expr_rows, columns=['{', '(', 'space_before_(', 'c_expr'])\n",
"# NOTE(review): the original remark here read 'makes the space_before_( more important\n",
"# for the clustering algo' - presumably it refers to the zeroing above; confirm\n",
"\n",
"c_expr_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "add32db0-81c5-4914-8050-2da4afa48340",
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>cluster</th>\n",
" <th>c_expr</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3</td>\n",
" <td>server_onion_keys_t * server_onion_keys_new(vo...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3</td>\n",
" <td>void server_onion_keys_free_(server_onion_keys...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3</td>\n",
" <td>void onion_handshake_state_release(onion_hands...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3</td>\n",
" <td>int onion_skin_create(int type, ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3</td>\n",
" <td>static int negotiate_v3_ntor_server_circ_param...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3</td>\n",
" <td>int onion_skin_server_handshake(int type, ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3</td>\n",
" <td>static int negotiate_v3_ntor_client_circ_param...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3</td>\n",
" <td>int onion_skin_client_handshake(int type, ...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" cluster c_expr\n",
"0 3 server_onion_keys_t * server_onion_keys_new(vo...\n",
"0 3 void server_onion_keys_free_(server_onion_keys...\n",
"0 3 void onion_handshake_state_release(onion_hands...\n",
"0 3 int onion_skin_create(int type, ...\n",
"0 3 static int negotiate_v3_ntor_server_circ_param...\n",
"0 3 int onion_skin_server_handshake(int type, ...\n",
"0 3 static int negotiate_v3_ntor_client_circ_param...\n",
"0 3 int onion_skin_client_handshake(int type, ..."
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.cluster import KMeans\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.preprocessing import StandardScaler\n",
"import pickle\n",
"\n",
"# feature matrix for the clustering: everything except the raw text and the '(' count\n",
"data = c_expr_df.drop(columns=['c_expr', '('])\n",
"# binary flag: 1 when at least one space precedes the first '(', otherwise 0\n",
"data['sb(>0'] = (data['space_before_('] != 0).astype(int)\n",
"\n",
"pipe = Pipeline([\n",
"    (\"scaler\", StandardScaler()),\n",
"    # NOTE(review): n_init is left at the library default, which differs between\n",
"    # scikit-learn releases; pin it if the cluster ids must stay stable - TODO confirm\n",
"    (\"cluster\", KMeans(random_state=1, n_clusters=4)),\n",
"])\n",
"\n",
"data['cluster'] = pipe.fit_predict(data)\n",
"\n",
"# persist the fitted pipeline; the context manager closes the file even if dump() raises\n",
"with open('cluster.pkl', 'wb') as outFd:\n",
"    pickle.dump(pipe, outFd)\n",
"\n",
"data['c_expr'] = c_expr_df['c_expr'].str.lstrip()\n",
"# cluster 3 is the group that held the function definitions in the saved output;\n",
"# NOTE(review): the id is an arbitrary KMeans label - re-check it after refitting\n",
"data.loc[data['cluster'] == 3, ['cluster', 'c_expr']]"
]
}
],
"metadata": {
"kernelspec": {
"argv": [
"python",
"-m",
"ipykernel_launcher",
"-f",
"{connection_file}"
],
"display_name": "Python 3 (ipykernel)",
"env": null,
"interrupt_mode": "signal",
"language": "python",
"metadata": {
"debugger": true
},
"name": "python3"
},
"name": "cluster.ipynb"
},
"nbformat": 4,
"nbformat_minor": 5
}