|
6 | 6 | "source": [
|
7 | 7 | "# Natural Language Processing with Deep Learning\n",
|
8 | 8 | "\n",
|
| 9 | + "__This is exactly the same notebook as in ../week10_textconv/. Feel free submit the seminar notebook, just make sure you read the assignments at the end.\n", |
| 10 | + "\n", |
9 | 11 | "Today we're gonna apply the newly learned DL tools for sequence processing to the task of predicting job salary.\n",
|
10 | 12 | "\n",
|
11 | 13 | "Special thanks to [Oleg Vasilev](https://github.com/Omrigan/) for the assignment core (orignally written for theano/tensorflow)."
|
|
50 | 52 | {
|
51 | 53 | "cell_type": "code",
|
52 | 54 | "execution_count": null,
|
53 |
| - "metadata": {}, |
| 55 | + "metadata": { |
| 56 | + "collapsed": true |
| 57 | + }, |
54 | 58 | "outputs": [],
|
55 | 59 | "source": [
|
56 | 60 | "data = pd.read_csv(\"./Train_rev1.csv\", index_col=None)\n",
|
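The diff only shows the `read_csv` line of this cell. For context, a minimal sketch of the loading step, assuming the Kaggle "Job Salary Prediction" schema (`Title`, `FullDescription`, `SalaryNormalized`; the `Log1pSalary` name is our assumption):

```python
import numpy as np
import pandas as pd

# Load the Kaggle "Job Salary Prediction" training data
data = pd.read_csv("./Train_rev1.csv", index_col=None)

# Salaries are heavily right-skewed, so predicting log(1 + salary) is more stable
data["Log1pSalary"] = np.log1p(data["SalaryNormalized"]).astype("float32")

# Empty text fields would break tokenization later, so fill them early
data["Title"] = data["Title"].fillna("NaN")
data["FullDescription"] = data["FullDescription"].fillna("NaN")
```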
|
78 | 82 | {
|
79 | 83 | "cell_type": "code",
|
80 | 84 | "execution_count": null,
|
81 |
| - "metadata": {}, |
| 85 | + "metadata": { |
| 86 | + "collapsed": true |
| 87 | + }, |
82 | 88 | "outputs": [],
|
83 | 89 | "source": [
|
84 | 90 | "print(\"Before\")\n",
|
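The cell that actually transforms the text between the "Before" and "After" printouts is elided from this diff. A sketch of what it plausibly does, assuming nltk's `WordPunctTokenizer` (an assumption; any tokenizer would work here):

```python
from nltk.tokenize import WordPunctTokenizer

tokenizer = WordPunctTokenizer()

def normalize(text):
    # lowercase, split into tokens, and re-join with single spaces
    return " ".join(tokenizer.tokenize(str(text).lower()))

data["Title"] = data["Title"].apply(normalize)
data["FullDescription"] = data["FullDescription"].apply(normalize)
```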
|
110 | 116 | {
|
111 | 117 | "cell_type": "code",
|
112 | 118 | "execution_count": null,
|
113 |
| - "metadata": {}, |
| 119 | + "metadata": { |
| 120 | + "collapsed": true |
| 121 | + }, |
114 | 122 | "outputs": [],
|
115 | 123 | "source": [
|
116 | 124 | "print(\"After\")\n",
|
|
144 | 152 | {
|
145 | 153 | "cell_type": "code",
|
146 | 154 | "execution_count": null,
|
147 |
| - "metadata": {}, |
| 155 | + "metadata": { |
| 156 | + "collapsed": true |
| 157 | + }, |
148 | 158 | "outputs": [],
|
149 | 159 | "source": [
|
150 | 160 | "print(\"Total unique tokens :\", len(token_counts))\n",
|
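`token_counts` is never defined in the visible hunks; one straightforward way to build it, assuming whitespace-separated tokens after the normalization above:

```python
from collections import Counter

# count every token occurrence in both text columns
token_counts = Counter()
for column in ["Title", "FullDescription"]:
    for text in data[column]:
        token_counts.update(text.split())

print("Total unique tokens:", len(token_counts))
```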
|
160 | 170 | {
|
161 | 171 | "cell_type": "code",
|
162 | 172 | "execution_count": null,
|
163 |
| - "metadata": {}, |
| 173 | + "metadata": { |
| 174 | + "collapsed": true |
| 175 | + }, |
164 | 176 | "outputs": [],
|
165 | 177 | "source": [
|
166 | 178 | "# Let's see how many words are there for each count\n",
|
|
197 | 209 | {
|
198 | 210 | "cell_type": "code",
|
199 | 211 | "execution_count": null,
|
200 |
| - "metadata": {}, |
| 212 | + "metadata": { |
| 213 | + "collapsed": true |
| 214 | + }, |
201 | 215 | "outputs": [],
|
202 | 216 | "source": [
|
203 | 217 | "print(\"Tokens left:\", len(tokens))\n",
|
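`tokens` here is presumably the vocabulary after dropping rare words. A sketch with a hypothetical `min_count` threshold:

```python
min_count = 10  # hypothetical threshold; tune against the count histogram above

# keep only tokens that occur at least min_count times
tokens = sorted(t for t, c in token_counts.items() if c >= min_count)
print("Tokens left:", len(tokens))
```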
|
229 | 243 | {
|
230 | 244 | "cell_type": "code",
|
231 | 245 | "execution_count": null,
|
232 |
| - "metadata": {}, |
| 246 | + "metadata": { |
| 247 | + "collapsed": true |
| 248 | + }, |
233 | 249 | "outputs": [],
|
234 | 250 | "source": [
|
235 | 251 | "assert isinstance(token_to_id, dict)\n",
|
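The assert implies `token_to_id` maps token strings to integer ids. A common construction reserves ids for padding and out-of-vocabulary words (the `PAD`/`UNK` names and their ids are assumptions):

```python
PAD, UNK = "PAD", "UNK"

# reserve 0 for padding and 1 for unknown tokens; real words start at 2
token_to_id = {PAD: 0, UNK: 1}
for token in tokens:
    token_to_id[token] = len(token_to_id)

assert isinstance(token_to_id, dict)
assert token_to_id[PAD] == 0  # the padding sketches below assume id 0
```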
|
275 | 291 | {
|
276 | 292 | "cell_type": "code",
|
277 | 293 | "execution_count": null,
|
278 |
| - "metadata": {}, |
| 294 | + "metadata": { |
| 295 | + "collapsed": true |
| 296 | + }, |
279 | 297 | "outputs": [],
|
280 | 298 | "source": [
|
281 | 299 | "#### print(\"Lines:\")\n",
|
|
296 | 314 | {
|
297 | 315 | "cell_type": "code",
|
298 | 316 | "execution_count": null,
|
299 |
| - "metadata": {}, |
| 317 | + "metadata": { |
| 318 | + "collapsed": true |
| 319 | + }, |
300 | 320 | "outputs": [],
|
301 | 321 | "source": [
|
302 | 322 | "from sklearn.feature_extraction import DictVectorizer\n",
|
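`DictVectorizer` one-hot encodes a list of `{column: value}` dicts, which is a convenient fit for the categorical job features. A usage sketch (the column names are taken from the Kaggle dataset and may need adjusting):

```python
import numpy as np
from sklearn.feature_extraction import DictVectorizer

categorical_columns = ["Category", "Company", "LocationNormalized",
                       "ContractType", "ContractTime"]
records = data[categorical_columns].fillna("NaN").to_dict(orient="records")

vectorizer = DictVectorizer(dtype=np.float32, sparse=False)
categorical_features = vectorizer.fit_transform(records)
print(categorical_features.shape)  # (n_rows, n_one_hot_columns)
```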
|
326 | 346 | {
|
327 | 347 | "cell_type": "code",
|
328 | 348 | "execution_count": null,
|
329 |
| - "metadata": {}, |
| 349 | + "metadata": { |
| 350 | + "collapsed": true |
| 351 | + }, |
330 | 352 | "outputs": [],
|
331 | 353 | "source": [
|
332 | 354 | "from sklearn.model_selection import train_test_split\n",
|
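The split itself is a one-liner; fixing `random_state` keeps the validation set identical across reruns (the fraction and seed below are guesses):

```python
from sklearn.model_selection import train_test_split

data_train, data_val = train_test_split(data, test_size=0.2, random_state=42)
print(len(data_train), len(data_val))
```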
|
368 | 390 | {
|
369 | 391 | "cell_type": "code",
|
370 | 392 | "execution_count": null,
|
371 |
| - "metadata": {}, |
| 393 | + "metadata": { |
| 394 | + "collapsed": true |
| 395 | + }, |
372 | 396 | "outputs": [],
|
373 | 397 | "source": [
|
374 | 398 | "generate_batch(data_train, 3, max_len=10)"
|
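`generate_batch` has to turn variable-length token sequences into fixed-size id matrices. A sketch of the padding helper it most likely wraps (the name `as_matrix` and the id conventions are assumptions):

```python
import numpy as np

def as_matrix(texts, token_to_id, max_len=None):
    # convert tokenized strings into a matrix of token ids, padded with 0
    token_seqs = [text.split()[:max_len] for text in texts]
    width = max(map(len, token_seqs))
    matrix = np.zeros((len(token_seqs), width), dtype="int64")
    for i, seq in enumerate(token_seqs):
        # unknown words fall back to the UNK id (1)
        matrix[i, :len(seq)] = [token_to_id.get(t, 1) for t in seq]
    return matrix
```

`generate_batch(data_train, 3, max_len=10)` would then sample 3 rows and return a dict with one such matrix per text column, plus the categorical features and the target.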
|
457 | 481 | {
|
458 | 482 | "cell_type": "code",
|
459 | 483 | "execution_count": null,
|
460 |
| - "metadata": {}, |
| 484 | + "metadata": { |
| 485 | + "collapsed": true |
| 486 | + }, |
461 | 487 | "outputs": [],
|
462 | 488 | "source": [
|
463 | 489 | "title_encoder = TitleEncoder(out_size=64)\n",
|
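`TitleEncoder` itself is not in the diff. Given the `out_size=64` call, a plausible 1-d convolutional encoder in the same 0.3-era PyTorch style as the rest of the notebook (every layer size here is an assumption):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class TitleEncoder(nn.Module):
    def __init__(self, n_tokens=10000, emb_size=64, out_size=64):
        super(TitleEncoder, self).__init__()
        self.emb = nn.Embedding(n_tokens, emb_size, padding_idx=0)
        self.conv = nn.Conv1d(emb_size, out_size, kernel_size=3, padding=1)

    def forward(self, text_ix):
        # [batch, time] -> [batch, time, emb] -> [batch, emb, time]
        h = self.emb(text_ix).permute(0, 2, 1)
        h = F.relu(self.conv(h))
        # global max-pooling over time yields a fixed-size vector
        return h.max(dim=2)[0]
```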
|
495 | 521 | {
|
496 | 522 | "cell_type": "code",
|
497 | 523 | "execution_count": null,
|
498 |
| - "metadata": {}, |
| 524 | + "metadata": { |
| 525 | + "collapsed": true |
| 526 | + }, |
499 | 527 | "outputs": [],
|
500 | 528 | "source": [
|
501 | 529 | "desc_encoder = <Create description encoder>\n",
|
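The `<Create description encoder>` placeholder is the exercise, so it is left as-is above. One possible (not the reference) answer reuses the `TitleEncoder` pattern sketched earlier with a wider output, since descriptions carry more signal than titles:

```python
# hypothetical: same convolutional recipe, wider output vector
desc_encoder = TitleEncoder(n_tokens=len(token_to_id), emb_size=64, out_size=128)
```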
|
686 | 714 | {
|
687 | 715 | "cell_type": "code",
|
688 | 716 | "execution_count": null,
|
689 |
| - "metadata": {}, |
| 717 | + "metadata": { |
| 718 | + "collapsed": true |
| 719 | + }, |
690 | 720 | "outputs": [],
|
691 | 721 | "source": [
|
692 | 722 | "for epoch_i in range(num_epochs):\n",
|
|
739 | 769 | " print('\\n\\n')"
|
740 | 770 | ]
|
741 | 771 | },
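Only the first and last lines of the training loop survive in this diff. For orientation, a sketch of one training step in the same pre-0.4 PyTorch style (`model`, `opt`, `compute_loss`, `generate_batch`, and `target_column` are defined elsewhere in the notebook; the batch keys mirror the evaluation cell below):

```python
import torch
from torch.autograd import Variable

batch = generate_batch(data_train, 32)  # batch size is a guess
title_ix = Variable(torch.LongTensor(batch["Title"]))
desc_ix = Variable(torch.LongTensor(batch["FullDescription"]))
cat_features = Variable(torch.FloatTensor(batch["Categorical"]))
reference = Variable(torch.FloatTensor(batch[target_column]))

prediction = model(title_ix, desc_ix, cat_features)
loss = compute_loss(reference, prediction)

opt.zero_grad()  # clear gradients accumulated by the previous step
loss.backward()  # backprop through both encoders
opt.step()       # apply the update
```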
|
| 772 | + { |
| 773 | + "cell_type": "code", |
| 774 | + "execution_count": null, |
| 775 | + "metadata": { |
| 776 | + "collapsed": true |
| 777 | + }, |
| 778 | + "outputs": [], |
| 779 | + "source": [ |
| 780 | + "print(\"Final eval:\")\n", |
| 781 | + "for batch in iterate_minibatches(data_val, shuffle=False):\n", |
| 782 | + " title_ix = Variable(torch.LongTensor(batch[\"Title\"]), volatile=True)\n", |
| 783 | + " desc_ix = Variable(torch.LongTensor(batch[\"FullDescription\"]), volatile=True)\n", |
| 784 | + " cat_features = Variable(torch.FloatTensor(batch[\"Categorical\"]), volatile=True)\n", |
| 785 | + " reference = Variable(torch.FloatTensor(batch[target_column]), volatile=True)\n", |
| 786 | + "\n", |
| 787 | + " prediction = model(title_ix, desc_ix, cat_features)\n", |
| 788 | + " loss = compute_loss(reference, prediction)\n", |
| 789 | + "\n", |
| 790 | + " val_loss += loss.data.numpy()[0]\n", |
| 791 | + " val_mae += compute_mae(reference, prediction).data.numpy()[0]\n", |
| 792 | + " val_batches += 1\n", |
| 793 | + "\n", |
| 794 | + "print(\"\\tLoss:\\t%.5f\" % (val_loss / val_batches))\n", |
| 795 | + "print(\"\\tMAE:\\t%.5f\" % (val_mae / val_batches))\n", |
| 796 | + "print('\\n\\n')" |
| 797 | + ] |
| 798 | + }, |
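Note that `volatile=True` is the pre-0.4 way to disable autograd during inference; PyTorch 0.4 removed it in favor of a context manager. On a newer version the equivalent is:

```python
# PyTorch >= 0.4 replacement for Variable(..., volatile=True)
with torch.no_grad():
    prediction = model(title_ix, desc_ix, cat_features)
```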
742 | 799 | {
|
743 | 800 | "cell_type": "markdown",
|
744 | 801 | "metadata": {},
|
|
806 | 863 | " * Maintain the best-on-validation snapshot via `model.state_dict`\n",
|
807 | 864 | " * Plotting learning curves is usually a good idea"
|
808 | 865 | ]
|
| 866 | + }, |
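One way to implement the best-snapshot tip from the list above, assuming `val_loss` is recomputed once per epoch:

```python
import copy

best_val_loss = float("inf")
best_state = None

# inside the epoch loop, after measuring val_loss:
if val_loss < best_val_loss:
    best_val_loss = val_loss
    best_state = copy.deepcopy(model.state_dict())  # snapshot the best weights

# after training, roll back to the best snapshot
model.load_state_dict(best_state)
```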
| 867 | + { |
| 868 | + "cell_type": "markdown", |
| 869 | + "metadata": {}, |
| 870 | + "source": [ |
| 871 | + "### A short report\n", |
| 872 | + "\n", |
| 873 | + "Please tell us what you did and how did it work.\n", |
| 874 | + "\n", |
| 875 | + "`<YOUR_TEXT_HERE>`, i guess..." |
| 876 | + ] |
| 877 | + }, |
| 878 | + { |
| 879 | + "cell_type": "code", |
| 880 | + "execution_count": null, |
| 881 | + "metadata": { |
| 882 | + "collapsed": true |
| 883 | + }, |
| 884 | + "outputs": [], |
| 885 | + "source": [] |
809 | 886 | }
|
810 | 887 | ],
|
811 | 888 | "metadata": {
|
|