adashofdata · Hashem-Zarafat · Jun 11, 2020 · Jun 11, 2020 · Jun 11, 2020 · Jun 17, 2020
diff --git a/.ipynb_checkpoints/1-Data-Cleaning NEW-checkpoint.ipynb b/.ipynb_checkpoints/1-Data-Cleaning NEW-checkpoint.ipynb
@@ -0,0 +1,143 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Data Cleaning"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Introduction"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load pickled files\n",
+    "\n",
+    "data = {}\n",
+    "\n",
+    "with open(\"transcripts/DFI.txt\", \"rb\") as file:\n",
+    "    data = file.read().decode(\"utf-8\") \n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Cleaning The Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['dfi', 'should', 'be', 'assess', 'beyond', 'financi', 'perform', 'petal', 'jaya', ':', 'develop', 'financi', 'institut', '(', 'dfi', ')', 'should', 'be', 'assess', 'beyond', 'financi', 'perform', ',', 'by', 'look', 'at', 'the', 'deliver', 'of', 'their', 'mandat', 'role', '.', 'sme', 'develop', 'bank', 'malaysia', 'bhd', '(', 'sme', 'bank', ')', 'chief', 'oper', 'offic', ',', 'khairil', 'anuar', 'mohamad', 'anuar', 'said', 'the', 'dfi', '’', 'role', 'is', 'to', 'act', 'on', ',', 'or', 'to', 'drive', 'govern', 'agenda', ',', 'as', 'well', 'as', 'to', 'play', 'a', 'counter-cycl', 'role', 'to', 'support', 'and', 'push', 'the', 'economi', 'further', '.', '“', 'guidelin', 'by', 'bank', 'negara', 'to', 'dfi', 'in', 'assess', 'their', 'perform', 'are', 'not', 'onli', 'base', 'on', 'financi', 'but']\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Apply a third round of cleaning: https://machinelearningmastery.com/clean-text-machine-learning-python/\n",
+    "# Tokenization and Cleaning with NLTK\n",
+    "\n",
+    "# split into words\n",
+    "from nltk.tokenize import word_tokenize\n",
+    "tokens = word_tokenize(data)\n",
+    "# convert to lower case\n",
+    "tokens = [w.lower() for w in tokens]\n",
+    "# remove punctuation from each word\n",
+    "import string\n",
+    "table = str.maketrans('', '', string.punctuation)\n",
+    "stripped = [w.translate(table) for w in tokens]\n",
+    "# remove remaining tokens that are not alphabetic\n",
+    "words = [word for word in stripped if word.isalpha()]\n",
+    "# filter out stop words\n",
+    "from nltk.corpus import stopwords\n",
+    "stop_words = set(stopwords.words('english'))\n",
+    "words = [w for w in words if not w in stop_words]\n",
+    "\n",
+    "\n",
+    "tokens = word_tokenize(data)\n",
+    "\n",
+    "# stemming of words\n",
+    "from nltk.stem.porter import PorterStemmer\n",
+    "porter = PorterStemmer()\n",
+    "stemmed = [porter.stem(word) for word in tokens]\n",
+    "print(stemmed[:100])\n",
+    "\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.6"
+  },
+  "toc": {
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "toc_cell": false,
+   "toc_position": {},
+   "toc_section_display": "block",
+   "toc_window_display": false
+  },
+  "varInspector": {
+   "cols": {
+    "lenName": 16,
+    "lenType": 16,
+    "lenVar": 40
+   },
+   "kernels_config": {
+    "python": {
+     "delete_cmd_postfix": "",
+     "delete_cmd_prefix": "del ",
+     "library": "var_list.py",
+     "varRefreshCmd": "print(var_dic_list())"
+    },
+    "r": {
+     "delete_cmd_postfix": ") ",
+     "delete_cmd_prefix": "rm(",
+     "library": "var_list.r",
+     "varRefreshCmd": "cat(var_dic_list()) "
+    }
+   },
+   "types_to_exclude": [
+    "module",
+    "function",
+    "builtin_function_or_method",
+    "instance",
+    "_Feature"
+   ],
+   "window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/.ipynb_checkpoints/1-Data-Cleaning TEXT-checkpoint.ipynb b/.ipynb_checkpoints/1-Data-Cleaning TEXT-checkpoint.ipynb
diff --git a/.ipynb_checkpoints/1-Data-Cleaning-checkpoint.ipynb b/.ipynb_checkpoints/1-Data-Cleaning-checkpoint.ipynb