-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconfig.py
41 lines (32 loc) · 2.16 KB
/
config.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# Import SimpleNamespace to create a simple object for storing configuration values
from types import SimpleNamespace
# Create a configuration object using SimpleNamespace to organize all model settings
CONFIG = SimpleNamespace(
    # ---------- Model architecture ----------
    # Shape of the neural network itself.
    **{
        "n_layer": 4,  # transformer layers in the model
        "n_head": 4,  # attention heads per transformer layer
        "d_model": 256,  # embedding dimension (size of the model's vectors)
        "d_ff": 1024,  # hidden-layer dimension of the feed-forward network
        "max_seq_len": 128,  # longest sequence the model can process
    },
    # ---------- Training ----------
    # How the model learns from data.
    **{
        "batch_size": 8,  # samples processed per training step
        "lr": 2e-4,  # learning rate: how quickly weights are updated
        "weight_decay": 0.1,  # regularization that penalizes large weights
        "epochs": 150,  # complete passes over the training data
        "warmup_steps": 100,  # steps over which the learning rate ramps up
        "grad_clip": 1.0,  # maximum gradient value, guards against explosion
    },
    # ---------- Dataset ----------
    # Which data to train on.
    **{
        "dataset_name": "wikitext",  # name of the dataset to use
        "dataset_config": "wikitext-2-raw-v1",  # specific dataset version
        "subset_size": 500,  # optional smaller subset for faster testing
    },
    # ---------- Text generation ----------
    # How text is generated from the model.
    **{
        "temperature": 0.7,  # randomness: lower = predictable, higher = creative
        "top_k": 50,  # consider only the K most likely next words
    },
)
# This configuration object helps keep all hyperparameters organized in one place
# The values shown are typical starting points for a small transformer model
# As you learn more about AI, you can experiment with adjusting these values!