318 lines
9.8 KiB
Plaintext
318 lines
9.8 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"id": "9bbbd1e6-64a8-4301-81ea-a911315f58f2",
|
|
"metadata": {
|
|
"collapsed": false
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import re\n",
|
|
"\n",
|
|
"# Read the C source to analyse; the context manager closes the handle even if read() raises.\n",
"with open(\"onion_crypto.c\", \"r\") as fd:\n",
"    src = fd.read()\n",
|
|
"\n",
|
|
"# remove comments\n",
|
|
"# this function was taken from:\n",
|
|
"# https://stackoverflow.com/questions/241327/remove-c-and-c-comments-using-python\n",
|
|
"def comment_remover(text):\n",
|
|
" def replacer(match):\n",
|
|
" s = match.group(0)\n",
|
|
" if s.startswith('/'):\n",
|
|
" return \" \" # note: a space and not an empty string\n",
|
|
" else:\n",
|
|
" return s\n",
|
|
" pattern = re.compile(\n",
|
|
" r'//.*?$|/\\*.*?\\*/|\\'(?:\\\\.|[^\\\\\\'])*\\'|\"(?:\\\\.|[^\\\\\"])*\"',\n",
|
|
" re.DOTALL | re.MULTILINE\n",
|
|
" )\n",
|
|
" return re.sub(pattern, replacer, text)\n",
|
|
"\n",
|
|
"src = comment_remover(src)\n",
|
|
"\n",
|
|
"\n",
|
|
"# remove preprocessor macros\n",
|
|
"def preproc_remover(text):\n",
|
|
" t = False\n",
|
|
" ret = \"\"\n",
|
|
" for l in text.split('\\n'):\n",
|
|
" if l.lstrip() == '':\n",
|
|
" continue\n",
|
|
" if not '#' == l.lstrip()[0] and False == t:\n",
|
|
" ret += l + \"\\n\"\n",
|
|
" else:\n",
|
|
" t = False\n",
|
|
" if '\\\\' == l[-1]:\n",
|
|
" t = True\n",
|
|
" return ret\n",
|
|
"\n",
|
|
"src = preproc_remover(src)\n",
"\n",
"# Flatten the source to a single line, then cut it after every '{', '}', ';'\n",
"# or ':' so that each entry of c_expr ends with its delimiter.\n",
"# The split is purely textual: a delimiter inside a string literal, a ternary\n",
"# or a for-header also ends an entry.\n",
"src = src.replace('\\n', ' ')\n",
"c_expr = []\n",
"# Raw pattern without escapes: '\\{' and '\\}' are invalid escape sequences in\n",
"# a normal string literal, and braces need no escaping inside a character class.\n",
"for s in re.split(r\"([{};:])\", src):\n",
"    if s in ('{', '}', ';', ':'):\n",
"        # re.split with a capture group always yields a (possibly empty)\n",
"        # text piece before the first delimiter, so c_expr is not empty here\n",
"        c_expr[-1] += s\n",
"    else:\n",
"        c_expr.append(s)\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"id": "b72627a9-2bd9-432f-aa97-3c4b1c035efa",
|
|
"metadata": {
|
|
"collapsed": false
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>{</th>\n",
|
|
" <th>(</th>\n",
|
|
" <th>space_before_(</th>\n",
|
|
" <th>c_expr</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>static const uint8_t NTOR3_CIRC_VERIFICATION[]...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>static const size_t NTOR3_CIRC_VERIFICATION_L...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>2</td>\n",
|
|
" <td>server_onion_keys_t * server_onion_keys_new(v...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>2</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>server_onion_keys_t *keys = tor_malloc_zero...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>2</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>memcpy(keys->my_identity, router_get_my_id_...</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" { ( space_before_( c_expr\n",
|
|
"0 0 0 0 static const uint8_t NTOR3_CIRC_VERIFICATION[]...\n",
|
|
"0 0 0 0 static const size_t NTOR3_CIRC_VERIFICATION_L...\n",
|
|
"0 1 1 2 server_onion_keys_t * server_onion_keys_new(v...\n",
|
|
"0 0 2 3 server_onion_keys_t *keys = tor_malloc_zero...\n",
|
|
"0 0 2 0 memcpy(keys->my_identity, router_get_my_id_..."
|
|
]
|
|
},
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"\n",
|
|
"# Create a dataframe with one feature row per entry of c_expr.\n",
"# The records are collected first and the frame is built in a single call:\n",
"# concatenating inside the loop copies the frame on every iteration and\n",
"# leaves every row with the index label 0.\n",
"c_expr_records = []\n",
"for e in c_expr:\n",
"    paren_count = e.count('(')\n",
"    c_expr_records.append({\n",
"        '{': e.count('{'),\n",
"        '(': paren_count,\n",
"        # counts only the spaces before the first parenthesis;\n",
"        # forced to 0 when the expression contains no parenthesis at all\n",
"        'space_before_(': e.split(\"(\")[0].strip().count(' ') if paren_count else 0,\n",
"        'c_expr': e,\n",
"    })\n",
"\n",
"c_expr_df = pd.DataFrame(c_expr_records, columns=['{', '(', 'space_before_(', 'c_expr'])\n",
"# NOTE(review): the original note here read \"makes the space_before_( more\n",
"# important for the clustering algo\" - presumably it refers to the extra\n",
"# 'sb(>0' indicator derived from this column in the clustering cell; confirm.\n",
"\n",
"c_expr_df.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"id": "add32db0-81c5-4914-8050-2da4afa48340",
|
|
"metadata": {
|
|
"collapsed": false
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>cluster</th>\n",
|
|
" <th>c_expr</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>3</td>\n",
|
|
" <td>server_onion_keys_t * server_onion_keys_new(vo...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>3</td>\n",
|
|
" <td>void server_onion_keys_free_(server_onion_keys...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>3</td>\n",
|
|
" <td>void onion_handshake_state_release(onion_hands...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>3</td>\n",
|
|
" <td>int onion_skin_create(int type, ...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>3</td>\n",
|
|
" <td>static int negotiate_v3_ntor_server_circ_param...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>3</td>\n",
|
|
" <td>int onion_skin_server_handshake(int type, ...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>3</td>\n",
|
|
" <td>static int negotiate_v3_ntor_client_circ_param...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>3</td>\n",
|
|
" <td>int onion_skin_client_handshake(int type, ...</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" cluster c_expr\n",
|
|
"0 3 server_onion_keys_t * server_onion_keys_new(vo...\n",
|
|
"0 3 void server_onion_keys_free_(server_onion_keys...\n",
|
|
"0 3 void onion_handshake_state_release(onion_hands...\n",
|
|
"0 3 int onion_skin_create(int type, ...\n",
|
|
"0 3 static int negotiate_v3_ntor_server_circ_param...\n",
|
|
"0 3 int onion_skin_server_handshake(int type, ...\n",
|
|
"0 3 static int negotiate_v3_ntor_client_circ_param...\n",
|
|
"0 3 int onion_skin_client_handshake(int type, ..."
|
|
]
|
|
},
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"from sklearn.cluster import KMeans\n",
|
|
"from sklearn.pipeline import Pipeline\n",
|
|
"from sklearn.preprocessing import StandardScaler\n",
|
|
"import pickle\n",
|
|
"\n",
|
|
"# Features used for clustering: the '{' count and the spaces before the first\n",
"# '('; the raw expression text and the '(' count are left out.\n",
"data = c_expr_df.drop(columns=['c_expr', '('])\n",
"# binary indicator: 1 when at least one space precedes the first '(', else 0\n",
"# NOTE(review): presumably added to give that feature more weight in KMeans - confirm\n",
"data['sb(>0'] = (data['space_before_('] != 0).astype(int)\n",
"\n",
"pipe = Pipeline([\n",
"    (\"scaler\", StandardScaler()),\n",
"    (\"cluster\", KMeans(random_state=1, n_clusters=4)),\n",
"    ])\n",
"\n",
"data['cluster'] = pipe.fit_predict(data)\n",
"\n",
"# Persist the fitted pipeline; the context manager closes the file even if\n",
"# pickle.dump() raises.\n",
"with open('cluster.pkl', 'wb') as outFd:\n",
"    pickle.dump(pipe, outFd)\n",
"\n",
"data['c_expr'] = c_expr_df['c_expr'].str.lstrip()\n",
"# NOTE(review): KMeans label numbers are arbitrary. In the recorded run cluster 3\n",
"# appears to hold the function definitions; re-check this id whenever the input\n",
"# file or the scikit-learn version changes.\n",
"data.loc[data['cluster'] == 3, ['cluster', 'c_expr']]"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"argv": [
|
|
"python",
|
|
"-m",
|
|
"ipykernel_launcher",
|
|
"-f",
|
|
"{connection_file}"
|
|
],
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"env": null,
|
|
"interrupt_mode": "signal",
|
|
"language": "python",
|
|
"metadata": {
|
|
"debugger": true
|
|
},
|
|
"name": "python3"
|
|
},
|
|
"name": "cluster.ipynb"
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|