Initial base
saif-ellafi committed Sep 1, 2018
1 parent aa0b71d commit 20b8f14
Showing 24 changed files with 4,153 additions and 0 deletions.
3 changes: 3 additions & 0 deletions README.md
@@ -0,0 +1,3 @@
Based on Spark-NLP 1.6.2

Requires Spark-NLP installed from pip and a PySpark environment (pyspark or spark installed from pip)
303 changes: 303 additions & 0 deletions jupyter/crf-ner/ner.ipynb
@@ -0,0 +1,303 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Import appropriate modules"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"sys.path.append('../../')\n",
"\n",
"from pyspark.sql import SparkSession\n",
"from pyspark.ml import Pipeline\n",
"\n",
"from sparknlp.annotator import *\n",
"from sparknlp.common import *\n",
"from sparknlp.base import *\n",
"\n",
"import time\n",
"import zipfile"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Download training dataset if not already there"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Download CoNLL 2003 Dataset\n",
"import os\n",
"from pathlib import Path\n",
"import urllib.request\n",
"\n",
"if not Path(\"eng.train\").is_file():\n",
" url = \"https://github.com/patverga/torch-ner-nlp-from-scratch/raw/master/data/conll2003/eng.train\"\n",
" urllib.request.urlretrieve(url, 'eng.train')\n"
]
},
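{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optionally, peek at the first few lines of the downloaded file to see the CoNLL 2003 format (token, POS tag, chunk tag, NER label per line). This is a minimal sanity check, not part of the original walkthrough."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Print the first lines of eng.train to inspect the CoNLL column layout\n",
"with open(\"eng.train\") as f:\n",
"    for _ in range(5):\n",
"        print(f.readline().rstrip())"
]
},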
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Download Glove word embeddings"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Download Glove Word Embeddings\n",
"file = \"glove.6B.zip\"\n",
"if not Path(\"glove.6B.zip\").is_file():\n",
" url = \"http://nlp.stanford.edu/data/glove.6B.zip\"\n",
" print(\"Start downoading Glove Word Embeddings. It will take some time, please wait...\")\n",
" urllib.request.urlretrieve(url, \"glove.6B.zip\")\n",
" print(\"Downloading finished\")\n",
" \n",
"if not Path(\"glove.6B.100d.txt\").is_file():\n",
" zip_ref = zipfile.ZipFile(file, 'r')\n",
" zip_ref.extractall(\"./\")\n",
" zip_ref.close()"
]
},
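{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optionally, verify the extracted embeddings: in the GloVe text format each line is a token followed by its vector, so the 100d file should yield 100 values per line. A quick sanity check, not part of the original walkthrough."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Read one line and confirm the embedding dimension is 100\n",
"with open(\"glove.6B.100d.txt\", encoding=\"utf-8\") as f:\n",
"    parts = f.readline().split()\n",
"print(parts[0], len(parts) - 1)"
]
},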
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Start Spark"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"spark = SparkSession.builder \\\n",
" .appName(\"ner\")\\\n",
" .master(\"local[1]\")\\\n",
" .config(\"spark.driver.memory\",\"8G\")\\\n",
" .config(\"spark.driver.maxResultSize\", \"2G\") \\\n",
" .config(\"spark.driver.extraClassPath\", \"lib/sparknlp.jar\")\\\n",
" .config(\"spark.kryoserializer.buffer.max\", \"500m\")\\\n",
" .getOrCreate()"
]
},
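{
"cell_type": "markdown",
"metadata": {},
"source": [
"Quick check that the session is up (spark.version is a standard SparkSession attribute); not part of the original walkthrough."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Confirm the SparkSession started and report its version\n",
"print(\"Spark version:\", spark.version)"
]
},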
{
"cell_type": "markdown",
"metadata": {},
"source": [
"1. Download CoNLL2003 dataset\n",
"2. Save 3 files eng.train, eng.testa, eng.testa, into working dir ./"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create annotator components in the right order, with their training Params. Finisher will output only NER. Put all in pipeline."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"documentAssembler = DocumentAssembler()\\\n",
" .setInputCol(\"text\")\\\n",
" .setOutputCol(\"document\")\n",
"\n",
"sentenceDetector = SentenceDetector()\\\n",
" .setInputCols([\"document\"])\\\n",
" .setOutputCol(\"sentence\")\n",
"\n",
"tokenizer = Tokenizer()\\\n",
" .setInputCols([\"document\"])\\\n",
" .setOutputCol(\"token\")\n",
"\n",
"posTagger = PerceptronApproach()\\\n",
" .setIterations(5)\\\n",
" .setInputCols([\"token\", \"document\"])\\\n",
" .setOutputCol(\"pos\")\\\n",
" .setCorpus(\"file:///\" + os.getcwd() + \"/../../../src/test/resources/anc-pos-corpus-small/\", \"|\")\n",
"\n",
"nerTagger = NerCrfApproach()\\\n",
" .setInputCols([\"sentence\", \"token\", \"pos\"])\\\n",
" .setLabelColumn(\"label\")\\\n",
" .setOutputCol(\"ner\")\\\n",
" .setMinEpochs(1)\\\n",
" .setMaxEpochs(5)\\\n",
" .setLossEps(1e-3)\\\n",
" .setEmbeddingsSource(\"glove.6B.100d.txt\", 100, 2)\\\n",
" .setExternalFeatures(\"file:///\" + os.getcwd() + \"/../../../src/test/resources/ner-corpus/dict.txt\", \",\")\\\n",
" .setExternalDataset(\"file:///\" + os.getcwd() + \"/eng.train\")\\\n",
" .setL2(1)\\\n",
" .setC0(1250000)\\\n",
" .setRandomSeed(0)\\\n",
" .setVerbose(2)\n",
"\n",
"finisher = Finisher() \\\n",
" .setInputCols([\"ner\"]) \\\n",
" .setIncludeKeys(True)\n",
"\n",
"pipeline = Pipeline(\n",
" stages = [\n",
" documentAssembler,\n",
" sentenceDetector,\n",
" tokenizer,\n",
" posTagger,\n",
" nerTagger,\n",
" finisher\n",
" ])\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Load a dataset for prediction. Training is not relevant from this dataset."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Load the input data to be annotated\n",
"data = spark. \\\n",
" read. \\\n",
" parquet(\"file:///\" + os.getcwd() + \"/../../../src/test/resources/sentiment.parquet\"). \\\n",
" limit(1000)\n",
"data.cache()\n",
"data.count()\n",
"data.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Traing the model. Training doesn't really do anything from the dataset itself."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"start = time.time()\n",
"print(\"Start fitting\")\n",
"model = pipeline.fit(data)\n",
"print(\"Fitting is ended\")\n",
"print (time.time() - start)"
]
},
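{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optionally, list the fitted stages as a sanity check that every annotator trained (PipelineModel exposes them via .stages); not part of the original walkthrough."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Each entry is a fitted transformer, in pipeline order\n",
"for stage in model.stages:\n",
"    print(stage)"
]
},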
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Run the prediction"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ner_data = model.transform(data)\n",
"ner_data.show()"
]
},
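{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optionally, look at just the Finisher output. This assumes the Finisher's default output column naming, i.e. the finished_ prefix on the input column (finished_ner), and that the original text column is kept."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Show the raw text next to the finished NER annotations\n",
"ner_data.select(\"text\", \"finished_ner\").show(truncate=False)"
]
},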
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Save model and pipeline into disk after training"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pipeline.write().overwrite().save(\"./ner_pipeline\")\n",
"model.write().overwrite().save(\"./ner_model\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Load the model and the pipeline"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"from pyspark.ml import PipelineModel, Pipeline\n",
"\n",
"Pipeline.read().load(\"./ner_pipeline\")\n",
"sameModel = PipelineModel.read().load(\"./ner_model\")"
]
},
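{
"cell_type": "markdown",
"metadata": {},
"source": [
"Use the reloaded model to re-run the prediction; the output should match the earlier run. A minimal round-trip check, not part of the original walkthrough."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Transform the same data with the model loaded from disk\n",
"sameModel.transform(data).show()"
]
},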
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 1
}