Skip to content

Commit 3c90fd8

Browse files
committed
nlp framework init
0 parents  commit 3c90fd8

31 files changed

+1760
-0
lines changed

.gitignore

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
# Byte-compiled / optimized / DLL files
2+
__pycache__/
3+
*.py[cod]
4+
*$py.class
5+
6+
# C extensions
7+
*.so
8+
9+
# Distribution / packaging
10+
.Python
11+
build/
12+
develop-eggs/
13+
dist/
14+
downloads/
15+
eggs/
16+
.eggs/
17+
lib/
18+
lib64/
19+
parts/
20+
sdist/
21+
var/
22+
wheels/
23+
pip-wheel-metadata/
24+
share/python-wheels/
25+
*.egg-info/
26+
.installed.cfg
27+
*.egg
28+
MANIFEST
29+
30+
# PyInstaller
31+
# Usually these files are written by a python script from a template
32+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
33+
*.manifest
34+
*.spec
35+
36+
# Installer logs
37+
pip-log.txt
38+
pip-delete-this-directory.txt
39+
40+
# Unit test / coverage reports
41+
htmlcov/
42+
.tox/
43+
.nox/
44+
.coverage
45+
.coverage.*
46+
.cache
47+
nosetests.xml
48+
coverage.xml
49+
*.cover
50+
*.py,cover
51+
.hypothesis/
52+
.pytest_cache/
53+
54+
# Translations
55+
*.mo
56+
*.pot
57+
58+
# Django stuff:
59+
*.log
60+
local_settings.py
61+
db.sqlite3
62+
db.sqlite3-journal
63+
64+
# Flask stuff:
65+
instance/
66+
.webassets-cache
67+
68+
# Scrapy stuff:
69+
.scrapy
70+
71+
# Sphinx documentation
72+
docs/_build/
73+
74+
# PyBuilder
75+
target/
76+
77+
# Jupyter Notebook
78+
.ipynb_checkpoints
79+
80+
# IPython
81+
profile_default/
82+
ipython_config.py
83+
84+
# pyenv
85+
.python-version
86+
87+
# pipenv
88+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
90+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
91+
# install all needed dependencies.
92+
#Pipfile.lock
93+
94+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
95+
__pypackages__/
96+
97+
# Celery stuff
98+
celerybeat-schedule
99+
celerybeat.pid
100+
101+
# SageMath parsed files
102+
*.sage.py
103+
104+
# Environments
105+
.env
106+
.venv
107+
env/
108+
venv/
109+
ENV/
110+
env.bak/
111+
venv.bak/
112+
113+
# Spyder project settings
114+
.spyderproject
115+
.spyproject
116+
117+
# Rope project settings
118+
.ropeproject
119+
120+
# mkdocs documentation
121+
/site
122+
123+
# mypy
124+
.mypy_cache/
125+
.dmypy.json
126+
dmypy.json
127+
128+
# Pyre type checker
129+
.pyre/
130+
.idea
131+
132+
nlp_data/

Dockerfile

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
FROM tensorflow/tensorflow:2.9.1
2+
3+
ENV HOME=/app
4+
ENV DATA_DIR='/nlp_data'
5+
ENV CUDA_VISIBLE_DEVICES=1
6+
ENV NLTK_DATA=/app/nltk_data
7+
8+
COPY . ${HOME}
9+
10+
RUN set -eux; \
11+
python --version
12+
RUN set -eux; \
13+
python -m pip install -U pip
14+
15+
RUN pip install --no-cache-dir joblib~=1.1.0
16+
# The deprecated "sklearn" PyPI alias is intentionally not installed (its current releases fail on install); scikit-learn provides the sklearn module.
17+
RUN pip install --no-cache-dir scikit-learn~=1.1.1
18+
RUN pip install --no-cache-dir nltk~=3.7
19+
RUN pip install --no-cache-dir PyYAML~=6.0
20+
21+
RUN set -eux; \
22+
python -m nltk.downloader stopwords
23+
24+
WORKDIR ${HOME}
25+
26+
ENTRYPOINT [ "python", "main.py" ]

LICENSE.txt

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
MIT License
2+
Copyright (c) 2023 Smartloop Inc
3+
Permission is hereby granted, free of charge, to any person obtaining a copy
4+
of this software and associated documentation files (the "Software"), to deal
5+
in the Software without restriction, including without limitation the rights
6+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7+
copies of the Software, and to permit persons to whom the Software is
8+
furnished to do so, subject to the following conditions:
9+
The above copyright notice and this permission notice shall be included in all
10+
copies or substantial portions of the Software.
11+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
12+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
13+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
14+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
15+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
16+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
17+
SOFTWARE.

MANIFEST.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
include data/sample.json

README.md

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
# Smartloop NLU Framework
2+
Natural language processing framework
3+
4+
# Train a bot
5+
6+
Use the `sample.json` file in the `data/` folder; you will pass the name of the bot as an argument in the next step.
7+
8+
Below is a sample training JSON containing the patterns and the name of the intent that will be resolved for a user input.
9+
10+
```json
11+
{
12+
"examples": {
13+
"intents": [
14+
{
15+
"text": "about",
16+
"intent": "about"
17+
},
18+
{
19+
"text": "company",
20+
"intent": "about"
21+
},
22+
{
23+
"text": "what is smartloop",
24+
"intent": "about"
25+
},
26+
{
27+
"text": "start",
28+
"intent": "start"
29+
},
30+
{
31+
"text": "menu",
32+
"intent": "start"
33+
},
34+
{
35+
"text": "hi",
36+
"intent": "start"
37+
}
38+
]
39+
},
40+
"lang": "en"
41+
}
42+
```
43+
44+
From the command line type the following to train the bot:
45+
46+
```
47+
python main.py train -i sample
48+
49+
```
50+
51+
# Testing the bot
52+
53+
To test the bot, type the following command:
54+
55+
```
56+
python main.py parse -i sample -t "I need a chatbot"
57+
```
58+
59+
This should return the intent name followed by the confidence level
60+
61+
```
62+
{
63+
"topIntent": {
64+
"intent": "i-need-chatbot",
65+
"confidence": 0.9999436140060425
66+
},
67+
"intents": [
68+
{
69+
"intent": "i-need-chatbot",
70+
"confidence": 0.9999436140060425
71+
},
72+
{
73+
"intent": "chatter-good-afternoon",
74+
"confidence": 4.835660001845099e-05
75+
},
76+
{
77+
"intent": "bizbot-no-way",
78+
"confidence": 3.6056665067008e-06
79+
},
80+
{
81+
"intent": "about-chatbot",
82+
"confidence": 1.9573460576793877e-06
83+
},
84+
{
85+
"intent": "contact",
86+
"confidence": 1.095663265004987e-06
87+
}
88+
]
89+
}
90+
```
91+
92+
## Tuning your model (Advanced)
93+
94+
It is possible to override the default training parameters to create a model that fits your needs; edit `config.yaml` to tune your model:
95+
96+
```yaml
97+
# number of epochs
98+
epochs: 100
99+
100+
# Use tensorboard callback
101+
logs: True
102+
103+
# classifier parameters
104+
embedded_intent_classifier:
105+
# base neurons, this will be increased based on the intent size
106+
neurons: 16
107+
# length of input len("hello how are you") = 4
108+
input_length: 100
109+
learning_rate: 1e-2
110+
flatten: False
111+
hidden_layers: 2
112+
# drop rate to avoid overfitting
113+
drop_rate: 0.2
114+
# early stop training in case of not improving
115+
early_stopping: True
116+
```
117+
118+
These values can vary based on model size and can be tuned using the grid search capabilities to find the optimal settings.
119+
120+
Here is a list of basic parameters and their meaning:
121+
122+
* epochs - This is the number of iterations where 1 epoch = 1 complete neural net cycle
123+
* learning_rate - How fast or slow the model learns across iterations
124+
* drop_rate - Adjust to prevent overfitting of the data to fine tune your model
125+
126+
127+
## Configuration
128+
129+
Install the stop words dictionary using the following command:
130+
131+
```
132+
python -m nltk.downloader stopwords
133+
```
134+
135+
## Debugging
136+
137+
Set `logs: True` in `config.yaml` to enable debugging using `tensorboard`. Once you have trained the bot, type the following command to start tensorboard:
138+
139+
```commandline
140+
tensorboard serve --logdir logs/nlp_data/<bot_id>/<model_id>
141+
```
142+
143+
144+
## Requirements
145+
146+
* Tensorflow (>=2.9.1)
147+
148+
## License
149+
Licensed under the MIT License; see `LICENSE.txt`.
150+
151+
Copyright 2021-2022 Smartloop Inc.

config.yaml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# number of epochs
2+
epochs: 100
3+
4+
# Use tensorboard callback
5+
logs: True
6+
7+
# classifier parameters
8+
embedded_intent_classifier:
9+
# base neurons to be used by LSTM model
10+
neurons: 32
11+
# length of input len("hello how are you") = 4
12+
input_length: 100
13+
# learning rate
14+
learning_rate: 1e-2
15+
# flatten
16+
flatten: False
17+
# number of hidden layers
18+
hidden_layers: 1
19+
# drop rate to avoid overfitting
20+
drop_rate: 0.5
21+
# early stop training in case of not improving
22+
early_stopping: True

0 commit comments

Comments
 (0)