Based on Spark-NLP 1.6.2

Requires Spark-NLP installed from pip and a pyspark environment (pyspark or spark from pip).
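
For example, assuming the matching release is published on PyPI, something like the following sets both up:

    pip install spark-nlp==1.6.2 pyspark
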
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Import the required modules"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "sys.path.append('../../')\n",
    "\n",
    "from pyspark.sql import SparkSession\n",
    "from pyspark.ml import Pipeline\n",
    "\n",
    "from sparknlp.annotator import *\n",
    "from sparknlp.common import *\n",
    "from sparknlp.base import *\n",
    "\n",
    "import time\n",
    "import zipfile"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Download the training dataset if it is not already present"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download the CoNLL 2003 training set\n",
    "import os\n",
    "from pathlib import Path\n",
    "import urllib.request\n",
    "\n",
    "if not Path(\"eng.train\").is_file():\n",
    "    url = \"https://github.com/patverga/torch-ner-nlp-from-scratch/raw/master/data/conll2003/eng.train\"\n",
    "    urllib.request.urlretrieve(url, 'eng.train')\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Download the GloVe word embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download the GloVe word embeddings\n",
    "file = \"glove.6B.zip\"\n",
    "if not Path(file).is_file():\n",
    "    url = \"http://nlp.stanford.edu/data/glove.6B.zip\"\n",
    "    print(\"Started downloading the GloVe word embeddings. This will take some time, please wait...\")\n",
    "    urllib.request.urlretrieve(url, file)\n",
    "    print(\"Download finished\")\n",
    "\n",
    "if not Path(\"glove.6B.100d.txt\").is_file():\n",
    "    with zipfile.ZipFile(file, 'r') as zip_ref:\n",
    "        zip_ref.extractall(\"./\")"
   ]
  },
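  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick check (not in the original notebook): peek at the first row of the extracted file to confirm the token/100-dimension layout that setEmbeddingsSource expects below."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: each GloVe line is a token followed by its vector components.\n",
    "with open(\"glove.6B.100d.txt\", encoding=\"utf-8\") as f:\n",
    "    first = f.readline().split()\n",
    "print(first[0], len(first) - 1)  # should print the first token and 100"
   ]
  },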
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Start Spark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "spark = SparkSession.builder \\\n",
    "    .appName(\"ner\")\\\n",
    "    .master(\"local[1]\")\\\n",
    "    .config(\"spark.driver.memory\", \"8G\")\\\n",
    "    .config(\"spark.driver.maxResultSize\", \"2G\")\\\n",
    "    .config(\"spark.driver.extraClassPath\", \"lib/sparknlp.jar\")\\\n",
    "    .config(\"spark.kryoserializer.buffer.max\", \"500m\")\\\n",
    "    .getOrCreate()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "1. Download the CoNLL 2003 dataset\n",
    "2. Save the three files eng.train, eng.testa, and eng.testb into the working dir ./ (a download sketch for the test splits follows in the next cell)"
   ]
  },
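  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal sketch of the step above, assuming (unverified) that eng.testa and eng.testb are published at the same location as eng.train:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: fetch the CoNLL 2003 test splits into the working dir.\n",
    "# Assumes the files sit next to eng.train in the repository used above.\n",
    "from pathlib import Path\n",
    "import urllib.request\n",
    "\n",
    "base = \"https://github.com/patverga/torch-ner-nlp-from-scratch/raw/master/data/conll2003/\"\n",
    "for name in [\"eng.testa\", \"eng.testb\"]:\n",
    "    if not Path(name).is_file():\n",
    "        urllib.request.urlretrieve(base + name, name)"
   ]
  },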
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create the annotator components in the correct order, with their training parameters. The Finisher will output only the NER results. Put everything into a pipeline."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "documentAssembler = DocumentAssembler()\\\n",
    "    .setInputCol(\"text\")\\\n",
    "    .setOutputCol(\"document\")\n",
    "\n",
    "sentenceDetector = SentenceDetector()\\\n",
    "    .setInputCols([\"document\"])\\\n",
    "    .setOutputCol(\"sentence\")\n",
    "\n",
    "tokenizer = Tokenizer()\\\n",
    "    .setInputCols([\"document\"])\\\n",
    "    .setOutputCol(\"token\")\n",
    "\n",
    "# Part-of-speech tagger, trained on the small corpus shipped with the repo\n",
    "posTagger = PerceptronApproach()\\\n",
    "    .setIterations(5)\\\n",
    "    .setInputCols([\"token\", \"document\"])\\\n",
    "    .setOutputCol(\"pos\")\\\n",
    "    .setCorpus(\"file:///\" + os.getcwd() + \"/../../../src/test/resources/anc-pos-corpus-small/\", \"|\")\n",
    "\n",
    "# CRF-based NER; training data comes from the external CoNLL file,\n",
    "# not from the DataFrame the pipeline is fitted on\n",
    "nerTagger = NerCrfApproach()\\\n",
    "    .setInputCols([\"sentence\", \"token\", \"pos\"])\\\n",
    "    .setLabelColumn(\"label\")\\\n",
    "    .setOutputCol(\"ner\")\\\n",
    "    .setMinEpochs(1)\\\n",
    "    .setMaxEpochs(5)\\\n",
    "    .setLossEps(1e-3)\\\n",
    "    .setEmbeddingsSource(\"glove.6B.100d.txt\", 100, 2)\\\n",
    "    .setExternalFeatures(\"file:///\" + os.getcwd() + \"/../../../src/test/resources/ner-corpus/dict.txt\", \",\")\\\n",
    "    .setExternalDataset(\"file:///\" + os.getcwd() + \"/eng.train\")\\\n",
    "    .setL2(1)\\\n",
    "    .setC0(1250000)\\\n",
    "    .setRandomSeed(0)\\\n",
    "    .setVerbose(2)\n",
    "\n",
    "finisher = Finisher() \\\n",
    "    .setInputCols([\"ner\"]) \\\n",
    "    .setIncludeKeys(True)\n",
    "\n",
    "pipeline = Pipeline(\n",
    "    stages = [\n",
    "        documentAssembler,\n",
    "        sentenceDetector,\n",
    "        tokenizer,\n",
    "        posTagger,\n",
    "        nerTagger,\n",
    "        finisher\n",
    "    ])\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Load a dataset for prediction. This dataset is not used for training; the NER model is trained from the external CoNLL file configured above."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the input data to be annotated\n",
    "data = spark. \\\n",
    "    read. \\\n",
    "    parquet(\"file:///\" + os.getcwd() + \"/../../../src/test/resources/sentiment.parquet\"). \\\n",
    "    limit(1000)\n",
    "data.cache()\n",
    "data.count()\n",
    "data.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Train the model. Training does not draw on this DataFrame itself; NerCrfApproach reads the external CoNLL dataset set earlier."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "start = time.time()\n",
    "print(\"Start fitting\")\n",
    "model = pipeline.fit(data)\n",
    "print(\"Fitting finished\")\n",
    "print(time.time() - start)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Run the prediction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ner_data = model.transform(data)\n",
    "ner_data.show()"
   ]
  },
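  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Optionally, look at only the Finisher output. A minimal sketch, assuming the Finisher keeps its default naming, which prefixes input columns with finished_:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: show the input text next to the finished NER annotations.\n",
    "# The column name finished_ner assumes the Finisher default naming.\n",
    "ner_data.select(\"text\", \"finished_ner\").show(truncate=False)"
   ]
  },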
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Save the model and the pipeline to disk after training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pipeline.write().overwrite().save(\"./ner_pipeline\")\n",
    "model.write().overwrite().save(\"./ner_model\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Load the model and the pipeline back from disk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "from pyspark.ml import PipelineModel, Pipeline\n",
    "\n",
    "samePipeline = Pipeline.read().load(\"./ner_pipeline\")\n",
    "sameModel = PipelineModel.read().load(\"./ner_model\")"
   ]
  },
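  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick sanity check (not in the original notebook): the reloaded model should produce the same annotations as the freshly trained one."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Transform the same data with the reloaded model; the output should\n",
    "# match ner_data produced by the in-memory model above.\n",
    "same_ner_data = sameModel.transform(data)\n",
    "same_ner_data.show()"
   ]
  },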
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}