|
6 | 6 | "source": [
|
7 | 7 | "# Natural Language Processing with Deep Learning\n",
|
8 | 8 | "\n",
|
| 9 | + "__This is exactly the same notebook as in ../week10_textconv/. Feel free submit the seminar notebook, just make sure you read the assignments at the end.\n", |
| 10 | + "\n", |
9 | 11 | "Today we're gonna apply the newly learned DL tools for sequence processing to the task of predicting job salary.\n",
|
10 | 12 | "\n",
|
11 | 13 | "Special thanks to [Oleg Vasilev](https://github.com/Omrigan/) for the assignment core (orignally written for theano/tensorflow)."
|
|
50 | 52 | {
|
51 | 53 | "cell_type": "code",
|
52 | 54 | "execution_count": null,
|
53 |
| - "metadata": {}, |
| 55 | + "metadata": { |
| 56 | + "collapsed": true |
| 57 | + }, |
54 | 58 | "outputs": [],
|
55 | 59 | "source": [
|
56 | 60 | "data = pd.read_csv(\"./Train_rev1.csv\", index_col=None)\n",
|
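The diff only shows the `read_csv` line of this cell. For context, a minimal sketch of the loading step, assuming the Kaggle "Job Salary Prediction" schema (`Title`, `FullDescription`, `SalaryNormalized`; the `Log1pSalary` name is our assumption):

```python
import numpy as np
import pandas as pd

# Load the Kaggle "Job Salary Prediction" training data
data = pd.read_csv("./Train_rev1.csv", index_col=None)

# Salaries are heavily right-skewed, so predicting log(1 + salary) is more stable
data["Log1pSalary"] = np.log1p(data["SalaryNormalized"]).astype("float32")

# Empty text fields would break tokenization later, so fill them early
data["Title"] = data["Title"].fillna("NaN")
data["FullDescription"] = data["FullDescription"].fillna("NaN")
```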
|
78 | 82 | {
|
79 | 83 | "cell_type": "code",
|
80 | 84 | "execution_count": null,
|
81 |
| - "metadata": {}, |
| 85 | + "metadata": { |
| 86 | + "collapsed": true |
| 87 | + }, |
82 | 88 | "outputs": [],
|
83 | 89 | "source": [
|
84 | 90 | "print(\"Before\")\n",
|
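The cell that actually transforms the text between the "Before" and "After" printouts is elided from this diff. A sketch of what it plausibly does, assuming nltk's `WordPunctTokenizer` (an assumption; any tokenizer would work here):

```python
from nltk.tokenize import WordPunctTokenizer

tokenizer = WordPunctTokenizer()

def normalize(text):
    # lowercase, split into tokens, and re-join with single spaces
    return " ".join(tokenizer.tokenize(str(text).lower()))

data["Title"] = data["Title"].apply(normalize)
data["FullDescription"] = data["FullDescription"].apply(normalize)
```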
|
110 | 116 | {
|
111 | 117 | "cell_type": "code",
|
112 | 118 | "execution_count": null,
|
113 |
| - "metadata": {}, |
| 119 | + "metadata": { |
| 120 | + "collapsed": true |
| 121 | + }, |
114 | 122 | "outputs": [],
|
115 | 123 | "source": [
|
116 | 124 | "print(\"After\")\n",
|
|
144 | 152 | {
|
145 | 153 | "cell_type": "code",
|
146 | 154 | "execution_count": null,
|
147 |
| - "metadata": {}, |
| 155 | + "metadata": { |
| 156 | + "collapsed": true |
| 157 | + }, |
148 | 158 | "outputs": [],
|
149 | 159 | "source": [
|
150 | 160 | "print(\"Total unique tokens :\", len(token_counts))\n",
|
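`token_counts` is never defined in the visible hunks; one straightforward way to build it, assuming whitespace-separated tokens after the normalization above:

```python
from collections import Counter

# count every token occurrence in both text columns
token_counts = Counter()
for column in ["Title", "FullDescription"]:
    for text in data[column]:
        token_counts.update(text.split())

print("Total unique tokens:", len(token_counts))
```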
|
160 | 170 | {
|
161 | 171 | "cell_type": "code",
|
162 | 172 | "execution_count": null,
|
163 |
| - "metadata": {}, |
| 173 | + "metadata": { |
| 174 | + "collapsed": true |
| 175 | + }, |
164 | 176 | "outputs": [],
|
165 | 177 | "source": [
|
166 | 178 | "# Let's see how many words are there for each count\n",
|
|
197 | 209 | {
|
198 | 210 | "cell_type": "code",
|
199 | 211 | "execution_count": null,
|
200 |
| - "metadata": {}, |
| 212 | + "metadata": { |
| 213 | + "collapsed": true |
| 214 | + }, |
201 | 215 | "outputs": [],
|
202 | 216 | "source": [
|
203 | 217 | "print(\"Tokens left:\", len(tokens))\n",
|
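`tokens` here is presumably the vocabulary after dropping rare words. A sketch with a hypothetical `min_count` threshold:

```python
min_count = 10  # hypothetical threshold; tune against the count histogram above

# keep only tokens that occur at least min_count times
tokens = sorted(t for t, c in token_counts.items() if c >= min_count)
print("Tokens left:", len(tokens))
```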
|
229 | 243 | {
|
230 | 244 | "cell_type": "code",
|
231 | 245 | "execution_count": null,
|
232 |
| - "metadata": {}, |
| 246 | + "metadata": { |
| 247 | + "collapsed": true |
| 248 | + }, |
233 | 249 | "outputs": [],
|
234 | 250 | "source": [
|
235 | 251 | "assert isinstance(token_to_id, dict)\n",
|
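The assert implies `token_to_id` maps token strings to integer ids. A common construction reserves ids for padding and out-of-vocabulary words (the `PAD`/`UNK` names and their ids are assumptions):

```python
PAD, UNK = "PAD", "UNK"

# reserve 0 for padding and 1 for unknown tokens; real words start at 2
token_to_id = {PAD: 0, UNK: 1}
for token in tokens:
    token_to_id[token] = len(token_to_id)

assert isinstance(token_to_id, dict)
assert token_to_id[PAD] == 0  # the padding sketches below assume id 0
```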
|
275 | 291 | {
|
276 | 292 | "cell_type": "code",
|
277 | 293 | "execution_count": null,
|
278 |
| - "metadata": {}, |
| 294 | + "metadata": { |
| 295 | + "collapsed": true |
| 296 | + }, |
279 | 297 | "outputs": [],
|
280 | 298 | "source": [
|
281 | 299 | "#### print(\"Lines:\")\n",
|
|
296 | 314 | {
|
297 | 315 | "cell_type": "code",
|
298 | 316 | "execution_count": null,
|
299 |
| - "metadata": {}, |
| 317 | + "metadata": { |
| 318 | + "collapsed": true |
| 319 | + }, |
300 | 320 | "outputs": [],
|
301 | 321 | "source": [
|
302 | 322 | "from sklearn.feature_extraction import DictVectorizer\n",
|
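`DictVectorizer` one-hot encodes a list of `{column: value}` dicts, which is a convenient fit for the categorical job features. A usage sketch (the column names are taken from the Kaggle dataset and may need adjusting):

```python
import numpy as np
from sklearn.feature_extraction import DictVectorizer

categorical_columns = ["Category", "Company", "LocationNormalized",
                       "ContractType", "ContractTime"]
records = data[categorical_columns].fillna("NaN").to_dict(orient="records")

vectorizer = DictVectorizer(dtype=np.float32, sparse=False)
categorical_features = vectorizer.fit_transform(records)
print(categorical_features.shape)  # (n_rows, n_one_hot_columns)
```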
|
326 | 346 | {
|
327 | 347 | "cell_type": "code",
|
328 | 348 | "execution_count": null,
|
329 |
| - "metadata": {}, |
| 349 | + "metadata": { |
| 350 | + "collapsed": true |
| 351 | + }, |
330 | 352 | "outputs": [],
|
331 | 353 | "source": [
|
332 | 354 | "from sklearn.model_selection import train_test_split\n",
|
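The split itself is a one-liner; fixing `random_state` keeps the validation set identical across reruns (the fraction and seed below are guesses):

```python
from sklearn.model_selection import train_test_split

data_train, data_val = train_test_split(data, test_size=0.2, random_state=42)
print(len(data_train), len(data_val))
```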
|
368 | 390 | {
|
369 | 391 | "cell_type": "code",
|
370 | 392 | "execution_count": null,
|
371 |
| - "metadata": {}, |
| 393 | + "metadata": { |
| 394 | + "collapsed": true |
| 395 | + }, |
372 | 396 | "outputs": [],
|
373 | 397 | "source": [
|
374 | 398 | "generate_batch(data_train, 3, max_len=10)"
|
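`generate_batch` has to turn variable-length token sequences into fixed-size id matrices. A sketch of the padding helper it most likely wraps (the name `as_matrix` and the id conventions are assumptions):

```python
import numpy as np

def as_matrix(texts, token_to_id, max_len=None):
    # convert tokenized strings into a matrix of token ids, padded with 0
    token_seqs = [text.split()[:max_len] for text in texts]
    width = max(map(len, token_seqs))
    matrix = np.zeros((len(token_seqs), width), dtype="int64")
    for i, seq in enumerate(token_seqs):
        # unknown words fall back to the UNK id (1)
        matrix[i, :len(seq)] = [token_to_id.get(t, 1) for t in seq]
    return matrix
```

`generate_batch(data_train, 3, max_len=10)` would then sample 3 rows and return a dict with one such matrix per text column, plus the categorical features and the target.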
|
457 | 481 | {
|
458 | 482 | "cell_type": "code",
|
459 | 483 | "execution_count": null,
|
460 |
| - "metadata": {}, |
| 484 | + "metadata": { |
| 485 | + "collapsed": true |
| 486 | + }, |
461 | 487 | "outputs": [],
|
462 | 488 | "source": [
|
463 | 489 | "title_encoder = TitleEncoder(out_size=64)\n",
|
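`TitleEncoder` itself is not in the diff. Given the `out_size=64` call, a plausible 1-d convolutional encoder in the same 0.3-era PyTorch style as the rest of the notebook (every layer size here is an assumption):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class TitleEncoder(nn.Module):
    def __init__(self, n_tokens=10000, emb_size=64, out_size=64):
        super(TitleEncoder, self).__init__()
        self.emb = nn.Embedding(n_tokens, emb_size, padding_idx=0)
        self.conv = nn.Conv1d(emb_size, out_size, kernel_size=3, padding=1)

    def forward(self, text_ix):
        # [batch, time] -> [batch, time, emb] -> [batch, emb, time]
        h = self.emb(text_ix).permute(0, 2, 1)
        h = F.relu(self.conv(h))
        # global max-pooling over time yields a fixed-size vector
        return h.max(dim=2)[0]
```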
|
495 | 521 | {
|
496 | 522 | "cell_type": "code",
|
497 | 523 | "execution_count": null,
|
498 |
| - "metadata": {}, |
| 524 | + "metadata": { |
| 525 | + "collapsed": true |
| 526 | + }, |
499 | 527 | "outputs": [],
|
500 | 528 | "source": [
|
501 | 529 | "desc_encoder = <Create description encoder>\n",
|
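The `<Create description encoder>` placeholder is the exercise, so it is left as-is above. One possible (not the reference) answer reuses the `TitleEncoder` pattern sketched earlier with a wider output, since descriptions carry more signal than titles:

```python
# hypothetical: same convolutional recipe, wider output vector
desc_encoder = TitleEncoder(n_tokens=len(token_to_id), emb_size=64, out_size=128)
```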
|
686 | 714 | {
|
687 | 715 | "cell_type": "code",
|
688 | 716 | "execution_count": null,
|
689 |
| - "metadata": {}, |
| 717 | + "metadata": { |
| 718 | + "collapsed": true |
| 719 | + }, |
690 | 720 | "outputs": [],
|
691 | 721 | "source": [
|
692 | 722 | "for epoch_i in range(num_epochs):\n",
|
|
739 | 769 | " print('\\n\\n')"
|
740 | 770 | ]
|
741 | 771 | },
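Only the first and last lines of the training loop survive in this diff. For orientation, a sketch of one training step in the same pre-0.4 PyTorch style (`model`, `opt`, `compute_loss`, `generate_batch`, and `target_column` are defined elsewhere in the notebook; the batch keys mirror the evaluation cell below):

```python
import torch
from torch.autograd import Variable

batch = generate_batch(data_train, 32)  # batch size is a guess
title_ix = Variable(torch.LongTensor(batch["Title"]))
desc_ix = Variable(torch.LongTensor(batch["FullDescription"]))
cat_features = Variable(torch.FloatTensor(batch["Categorical"]))
reference = Variable(torch.FloatTensor(batch[target_column]))

prediction = model(title_ix, desc_ix, cat_features)
loss = compute_loss(reference, prediction)

opt.zero_grad()  # clear gradients accumulated by the previous step
loss.backward()  # backprop through both encoders
opt.step()       # apply the update
```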
|
| 772 | + { |
| 773 | + "cell_type": "code", |
| 774 | + "execution_count": null, |
| 775 | + "metadata": { |
| 776 | + "collapsed": true |
| 777 | + }, |
| 778 | + "outputs": [], |
| 779 | + "source": [ |
| 780 | + "print(\"Final eval:\")\n", |
| 781 | + "for batch in iterate_minibatches(data_val, shuffle=False):\n", |
| 782 | + " title_ix = Variable(torch.LongTensor(batch[\"Title\"]), volatile=True)\n", |
| 783 | + " desc_ix = Variable(torch.LongTensor(batch[\"FullDescription\"]), volatile=True)\n", |
| 784 | + " cat_features = Variable(torch.FloatTensor(batch[\"Categorical\"]), volatile=True)\n", |
| 785 | + " reference = Variable(torch.FloatTensor(batch[target_column]), volatile=True)\n", |
| 786 | + "\n", |
| 787 | + " prediction = model(title_ix, desc_ix, cat_features)\n", |
| 788 | + " loss = compute_loss(reference, prediction)\n", |
| 789 | + "\n", |
| 790 | + " val_loss += loss.data.numpy()[0]\n", |
| 791 | + " val_mae += compute_mae(reference, prediction).data.numpy()[0]\n", |
| 792 | + " val_batches += 1\n", |
| 793 | + "\n", |
| 794 | + "print(\"\\tLoss:\\t%.5f\" % (val_loss / val_batches))\n", |
| 795 | + "print(\"\\tMAE:\\t%.5f\" % (val_mae / val_batches))\n", |
| 796 | + "print('\\n\\n')" |
| 797 | + ] |
| 798 | + }, |
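Note that `volatile=True` is the pre-0.4 way to disable autograd during inference; PyTorch 0.4 removed it in favor of a context manager. On a newer version the equivalent is:

```python
# PyTorch >= 0.4 replacement for Variable(..., volatile=True)
with torch.no_grad():
    prediction = model(title_ix, desc_ix, cat_features)
```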
742 | 799 | {
|
743 | 800 | "cell_type": "markdown",
|
744 | 801 | "metadata": {},
|
|
806 | 863 | " * Maintain the best-on-validation snapshot via `model.state_dict`\n",
|
807 | 864 | " * Plotting learning curves is usually a good idea"
|
808 | 865 | ]
|
| 866 | + }, |
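One way to implement the best-snapshot tip from the list above, assuming `val_loss` is recomputed once per epoch:

```python
import copy

best_val_loss = float("inf")
best_state = None

# inside the epoch loop, after measuring val_loss:
if val_loss < best_val_loss:
    best_val_loss = val_loss
    best_state = copy.deepcopy(model.state_dict())  # snapshot the best weights

# after training, roll back to the best snapshot
model.load_state_dict(best_state)
```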
| 867 | + { |
| 868 | + "cell_type": "markdown", |
| 869 | + "metadata": {}, |
| 870 | + "source": [ |
| 871 | + "### A short report\n", |
| 872 | + "\n", |
| 873 | + "Please tell us what you did and how did it work.\n", |
| 874 | + "\n", |
| 875 | + "`<YOUR_TEXT_HERE>`, i guess..." |
| 876 | + ] |
| 877 | + }, |
| 878 | + { |
| 879 | + "cell_type": "code", |
| 880 | + "execution_count": null, |
| 881 | + "metadata": { |
| 882 | + "collapsed": true |
| 883 | + }, |
| 884 | + "outputs": [], |
| 885 | + "source": [] |
809 | 886 | }
|
810 | 887 | ],
|
811 | 888 | "metadata": {
|
|