cHelper/cluster.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "9bbbd1e6-64a8-4301-81ea-a911315f58f2",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "fd = open(\"onion_crypto.c\", \"r\")\n",
    "src = fd.read()\n",
    "fd.close()\n",
    "\n",
    "# remove comments\n",
    "# this function was taken from:\n",
    "# https://stackoverflow.com/questions/241327/remove-c-and-c-comments-using-python\n",
    "def comment_remover(text):\n",
    "    def replacer(match):\n",
    "        s = match.group(0)\n",
    "        if s.startswith('/'):\n",
    "            return \" \" # note: a space and not an empty string\n",
    "        else:\n",
    "            return s\n",
    "    pattern = re.compile(\n",
    "        r'//.*?$|/\\*.*?\\*/|\\'(?:\\\\.|[^\\\\\\'])*\\'|\"(?:\\\\.|[^\\\\\"])*\"',\n",
    "        re.DOTALL | re.MULTILINE\n",
    "    )\n",
    "    return re.sub(pattern, replacer, text)\n",
    "\n",
    "src = comment_remover(src)\n",
    "\n",
    "\n",
    "# remove preprocessor macros\n",
    "def preproc_remover(text):\n",
    "    t = False\n",
    "    ret = \"\"\n",
    "    for l in text.split('\\n'):\n",
    "        if l.lstrip() == '':\n",
    "            continue\n",
    "        if not '#' == l.lstrip()[0] and False == t:\n",
    "            ret += l + \"\\n\"\n",
    "        else:\n",
    "            t = False\n",
    "            if '\\\\' == l[-1]:\n",
    "                t = True\n",
    "    return ret\n",
    "\n",
    "src = preproc_remover(src)\n",
    "\n",
    "\n",
    "src = src.replace('\\n', ' ')\n",
    "c_expr = []\n",
    "for s in re.split(\"([\\{\\};:])\", src):\n",
    "    if s not in ['{', '}', ';', ':']:\n",
    "        c_expr.append(s)\n",
    "    else:\n",
    "        c_expr[-1] += s\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "b72627a9-2bd9-432f-aa97-3c4b1c035efa",
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>{</th>\n",
       "      <th>(</th>\n",
       "      <th>space_before_(</th>\n",
       "      <th>c_expr</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>static const uint8_t NTOR3_CIRC_VERIFICATION[]...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>static const size_t NTOR3_CIRC_VERIFICATION_L...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>server_onion_keys_t * server_onion_keys_new(v...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>server_onion_keys_t *keys = tor_malloc_zero...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>memcpy(keys-&gt;my_identity, router_get_my_id_...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   {  (  space_before_(                                             c_expr\n",
       "0  0  0               0  static const uint8_t NTOR3_CIRC_VERIFICATION[]...\n",
       "0  0  0               0   static const size_t NTOR3_CIRC_VERIFICATION_L...\n",
       "0  1  1               2   server_onion_keys_t * server_onion_keys_new(v...\n",
       "0  0  2               3     server_onion_keys_t *keys = tor_malloc_zero...\n",
       "0  0  2               0     memcpy(keys->my_identity, router_get_my_id_..."
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# create a dataframe of the c_expr\n",
    "c_expr_df = pd.DataFrame()\n",
    "for e in c_expr:\n",
    "    ser = pd.Series({'{': e.count('{'),\n",
    "                     '(': e.count('('),\n",
    "                     # counts only the spaces before the first bracelet\n",
    "                     'space_before_(' : e.split(\"(\")[0].strip().count(' '),\n",
    "                     'c_expr' : e,\n",
    "                     })\n",
    "    c_expr_df = pd.concat([c_expr_df, ser.to_frame().T])\n",
    "    \n",
    "c_expr_df['space_before_('] = c_expr_df.apply(lambda row: 0 if 0 == row['('] else row['space_before_('], axis=1)\n",
    "# makes the space_before_( more important for the clustering algo\n",
    "\n",
    "c_expr_df.head()                     "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "add32db0-81c5-4914-8050-2da4afa48340",
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>cluster</th>\n",
       "      <th>c_expr</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3</td>\n",
       "      <td>server_onion_keys_t * server_onion_keys_new(vo...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3</td>\n",
       "      <td>void server_onion_keys_free_(server_onion_keys...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3</td>\n",
       "      <td>void onion_handshake_state_release(onion_hands...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3</td>\n",
       "      <td>int onion_skin_create(int type,               ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3</td>\n",
       "      <td>static int negotiate_v3_ntor_server_circ_param...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3</td>\n",
       "      <td>int onion_skin_server_handshake(int type,     ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3</td>\n",
       "      <td>static int negotiate_v3_ntor_client_circ_param...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3</td>\n",
       "      <td>int onion_skin_client_handshake(int type,     ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   cluster                                             c_expr\n",
       "0        3  server_onion_keys_t * server_onion_keys_new(vo...\n",
       "0        3  void server_onion_keys_free_(server_onion_keys...\n",
       "0        3  void onion_handshake_state_release(onion_hands...\n",
       "0        3  int onion_skin_create(int type,               ...\n",
       "0        3  static int negotiate_v3_ntor_server_circ_param...\n",
       "0        3  int onion_skin_server_handshake(int type,     ...\n",
       "0        3  static int negotiate_v3_ntor_client_circ_param...\n",
       "0        3  int onion_skin_client_handshake(int type,     ..."
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.cluster import KMeans\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "import pickle\n",
    "\n",
    "data = c_expr_df\n",
    "data = data.drop(['c_expr'], axis=1)\n",
    "data = data.drop(['('], axis=1)\n",
    "data['sb(>0'] = data.apply(lambda row: 0 if 0 == row['space_before_('] else 1, axis=1)\n",
    "\n",
    "pipe = Pipeline([\n",
    "    (\"scaler\", StandardScaler()),\n",
    "    (\"cluster\", KMeans(random_state=1, n_clusters=4)),\n",
    "    ])\n",
    "\n",
    "data['cluster'] = pipe.fit_predict(data)\n",
    "\n",
    "outFd = open('cluster.pkl', 'wb')\n",
    "pickle.dump(pipe, outFd)\n",
    "outFd.close()\n",
    "\n",
    "data['c_expr'] = c_expr_df['c_expr'].apply(lambda x: x.lstrip())\n",
    "data.loc[data['cluster'] == 3, ['cluster', 'c_expr']]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "argv": [
    "python",
    "-m",
    "ipykernel_launcher",
    "-f",
    "{connection_file}"
   ],
   "display_name": "Python 3 (ipykernel)",
   "env": null,
   "interrupt_mode": "signal",
   "language": "python",
   "metadata": {
    "debugger": true
   },
   "name": "python3"
  },
  "name": "cluster.ipynb"
 },
 "nbformat": 4,
 "nbformat_minor": 5
}