Initial base
saif-ellafi committed Sep 1, 2018
1 parent aa0b71d commit 20b8f14
Showing 24 changed files with 4,153 additions and 0 deletions.
3 changes: 3 additions & 0 deletions README.md
@@ -0,0 +1,3 @@
Based on Spark-NLP 1.6.2

Requires Spark-NLP installed from pip and a PySpark environment (pyspark or spark installed from pip)
303 changes: 303 additions & 0 deletions jupyter/crf-ner/ner.ipynb
@@ -0,0 +1,303 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Import appropriate modules"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"sys.path.append('../../')\n",
"\n",
"from pyspark.sql import SparkSession\n",
"from pyspark.ml import Pipeline\n",
"\n",
"from sparknlp.annotator import *\n",
"from sparknlp.common import *\n",
"from sparknlp.base import *\n",
"\n",
"import time\n",
"import zipfile"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Download training dataset if not already there"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Download CoNLL 2003 Dataset\n",
"import os\n",
"from pathlib import Path\n",
"import urllib.request\n",
"\n",
"if not Path(\"eng.train\").is_file():\n",
" url = \"https://github.com/patverga/torch-ner-nlp-from-scratch/raw/master/data/conll2003/eng.train\"\n",
" urllib.request.urlretrieve(url, 'eng.train')\n"
]
},
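{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optionally, peek at the first few lines of the downloaded file to see the CoNLL 2003 format (token, POS tag, chunk tag, NER label per line). This is a minimal sanity check, not part of the original walkthrough."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Print the first lines of eng.train to inspect the CoNLL column layout\n",
"with open(\"eng.train\") as f:\n",
"    for _ in range(5):\n",
"        print(f.readline().rstrip())"
]
},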
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Download Glove word embeddings"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Download Glove Word Embeddings\n",
"file = \"glove.6B.zip\"\n",
"if not Path(\"glove.6B.zip\").is_file():\n",
" url = \"http://nlp.stanford.edu/data/glove.6B.zip\"\n",
" print(\"Start downoading Glove Word Embeddings. It will take some time, please wait...\")\n",
" urllib.request.urlretrieve(url, \"glove.6B.zip\")\n",
" print(\"Downloading finished\")\n",
" \n",
"if not Path(\"glove.6B.100d.txt\").is_file():\n",
" zip_ref = zipfile.ZipFile(file, 'r')\n",
" zip_ref.extractall(\"./\")\n",
" zip_ref.close()"
]
},
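{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optionally, verify the extracted embeddings: in the GloVe text format each line is a token followed by its vector, so the 100d file should yield 100 values per line. A quick sanity check, not part of the original walkthrough."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Read one line and confirm the embedding dimension is 100\n",
"with open(\"glove.6B.100d.txt\", encoding=\"utf-8\") as f:\n",
"    parts = f.readline().split()\n",
"print(parts[0], len(parts) - 1)"
]
},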
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Start Spark"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"spark = SparkSession.builder \\\n",
" .appName(\"ner\")\\\n",
" .master(\"local[1]\")\\\n",
" .config(\"spark.driver.memory\",\"8G\")\\\n",
" .config(\"spark.driver.maxResultSize\", \"2G\") \\\n",
" .config(\"spark.driver.extraClassPath\", \"lib/sparknlp.jar\")\\\n",
" .config(\"spark.kryoserializer.buffer.max\", \"500m\")\\\n",
" .getOrCreate()"
]
},
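{
"cell_type": "markdown",
"metadata": {},
"source": [
"Quick check that the session is up (spark.version is a standard SparkSession attribute); not part of the original walkthrough."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Confirm the SparkSession started and report its version\n",
"print(\"Spark version:\", spark.version)"
]
},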
{
"cell_type": "markdown",
"metadata": {},
"source": [
"1. Download CoNLL2003 dataset\n",
"2. Save 3 files eng.train, eng.testa, eng.testa, into working dir ./"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create annotator components in the right order, with their training Params. Finisher will output only NER. Put all in pipeline."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"documentAssembler = DocumentAssembler()\\\n",
" .setInputCol(\"text\")\\\n",
" .setOutputCol(\"document\")\n",
"\n",
"sentenceDetector = SentenceDetector()\\\n",
" .setInputCols([\"document\"])\\\n",
" .setOutputCol(\"sentence\")\n",
"\n",
"tokenizer = Tokenizer()\\\n",
" .setInputCols([\"document\"])\\\n",
" .setOutputCol(\"token\")\n",
"\n",
"posTagger = PerceptronApproach()\\\n",
" .setIterations(5)\\\n",
" .setInputCols([\"token\", \"document\"])\\\n",
" .setOutputCol(\"pos\")\\\n",
" .setCorpus(\"file:///\" + os.getcwd() + \"/../../../src/test/resources/anc-pos-corpus-small/\", \"|\")\n",
"\n",
"nerTagger = NerCrfApproach()\\\n",
" .setInputCols([\"sentence\", \"token\", \"pos\"])\\\n",
" .setLabelColumn(\"label\")\\\n",
" .setOutputCol(\"ner\")\\\n",
" .setMinEpochs(1)\\\n",
" .setMaxEpochs(5)\\\n",
" .setLossEps(1e-3)\\\n",
" .setEmbeddingsSource(\"glove.6B.100d.txt\", 100, 2)\\\n",
" .setExternalFeatures(\"file:///\" + os.getcwd() + \"/../../../src/test/resources/ner-corpus/dict.txt\", \",\")\\\n",
" .setExternalDataset(\"file:///\" + os.getcwd() + \"/eng.train\")\\\n",
" .setL2(1)\\\n",
" .setC0(1250000)\\\n",
" .setRandomSeed(0)\\\n",
" .setVerbose(2)\n",
"\n",
"finisher = Finisher() \\\n",
" .setInputCols([\"ner\"]) \\\n",
" .setIncludeKeys(True)\n",
"\n",
"pipeline = Pipeline(\n",
" stages = [\n",
" documentAssembler,\n",
" sentenceDetector,\n",
" tokenizer,\n",
" posTagger,\n",
" nerTagger,\n",
" finisher\n",
" ])\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Load a dataset for prediction. Training is not relevant from this dataset."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Load the input data to be annotated\n",
"data = spark. \\\n",
" read. \\\n",
" parquet(\"file:///\" + os.getcwd() + \"/../../../src/test/resources/sentiment.parquet\"). \\\n",
" limit(1000)\n",
"data.cache()\n",
"data.count()\n",
"data.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Traing the model. Training doesn't really do anything from the dataset itself."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"start = time.time()\n",
"print(\"Start fitting\")\n",
"model = pipeline.fit(data)\n",
"print(\"Fitting is ended\")\n",
"print (time.time() - start)"
]
},
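{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optionally, list the fitted stages as a sanity check that every annotator trained (PipelineModel exposes them via .stages); not part of the original walkthrough."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Each entry is a fitted transformer, in pipeline order\n",
"for stage in model.stages:\n",
"    print(stage)"
]
},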
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Run the prediction"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ner_data = model.transform(data)\n",
"ner_data.show()"
]
},
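{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optionally, look at just the Finisher output. This assumes the Finisher's default output column naming, i.e. the finished_ prefix on the input column (finished_ner), and that the original text column is kept."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Show the raw text next to the finished NER annotations\n",
"ner_data.select(\"text\", \"finished_ner\").show(truncate=False)"
]
},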
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Save model and pipeline into disk after training"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pipeline.write().overwrite().save(\"./ner_pipeline\")\n",
"model.write().overwrite().save(\"./ner_model\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Load the model and the pipeline"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"from pyspark.ml import PipelineModel, Pipeline\n",
"\n",
"Pipeline.read().load(\"./ner_pipeline\")\n",
"sameModel = PipelineModel.read().load(\"./ner_model\")"
]
},
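{
"cell_type": "markdown",
"metadata": {},
"source": [
"Use the reloaded model to re-run the prediction; the output should match the earlier run. A minimal round-trip check, not part of the original walkthrough."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Transform the same data with the model loaded from disk\n",
"sameModel.transform(data).show()"
]
},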
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 1
}