[doc] add examples and minor updates #1071

Merged 15 commits on Mar 24, 2025

Commit: add examples and minor updates
zyaoj committed Feb 27, 2025
commit 8d842972da4cbd016dac62ec711f51dcf6cd1992
1 change: 1 addition & 0 deletions doc/source/index.rst
@@ -71,6 +71,7 @@ Documentation

notebooks/datapipeline
notebooks/dataset_gsm8k_sft
notebooks/models/load_model

.. toctree::
:maxdepth: 1
26 changes: 12 additions & 14 deletions doc/source/notebooks/dataset_gsm8k_sft.ipynb
@@ -34,7 +34,7 @@
" load_text_tokenizer,\n",
" setup_gangs,\n",
")\n",
"from fairseq2.recipes.config import GangSection\n",
"from fairseq2.recipes.config import GangSection, ModelSection\n",
"from fairseq2.recipes.lm import InstructionFinetuneDatasetSection\n",
"from fairseq2.datasets.instruction import (\n",
" InstructionDataset,\n",
@@ -88,9 +88,17 @@
"# prepare the seed\n",
"seed = 42\n",
"\n",
"# prepare the gang\n",
"gangs = setup_gangs(context, GangSection(tensor_parallel_size=5))\n",
"dataset = load_dataset(InstructionDataset, context, dataset_config, gangs)"
"class Config(object):\n",
" pass\n",
"\n",
"config = Config() # instantiate an object\n",
"config.gang = GangSection(tensor_parallel_size=1)\n",
"config.dataset = dataset_config\n",
"config.model = ModelSection(name=\"llama3_1_8b\")\n",
"gangs = setup_gangs(context, config)\n",
"dataset = load_dataset(InstructionDataset, context, config, gangs)\n",
"# load the tokenizer\n",
"tokenizer = load_text_tokenizer(context, config)"
]
},
{
@@ -119,16 +127,6 @@
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# load the tokenizer\n",
"tokenizer = load_text_tokenizer(context, \"llama3_1_8b\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
356 changes: 356 additions & 0 deletions doc/source/notebooks/models/load_model.ipynb
@@ -0,0 +1,356 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ✎ Load Model\n",
"\n",
"## Overview\n",
"\n",
"This notebook aims at illustrating on how to instantiate models in fairseq2."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from fairseq2 import setup_fairseq2\n",
"\n",
"# Always call setup_fairseq2() before using any fairseq2 functionality\n",
"setup_fairseq2()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"All models in fairseq2 inherit from PyTorch's `nn.Module`, providing standard PyTorch funtionality. The configuration can be easily customized."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"TransformerDecoderModel(\n",
" model_dim=2048\n",
" (decoder_frontend): TransformerEmbeddingFrontend(\n",
" model_dim=2048\n",
" (embed): StandardEmbedding(num_embeddings=32000, embedding_dim=2048)\n",
" (pos_encoder): None\n",
" (layer_norm): None\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (decoder): StandardTransformerDecoder(\n",
" model_dim=2048, self_attn_mask_factory=CausalAttentionMaskFactory(), norm_order=PRE\n",
" (layers): ModuleList(\n",
" (0-15): 16 x StandardTransformerDecoderLayer(\n",
" model_dim=2048, norm_order=PRE\n",
" (self_attn_layer_norm): RMSNorm(normalized_shape=(2048,), eps=1E-05, elementwise_affine=True)\n",
" (self_attn): StandardMultiheadAttention(\n",
" num_heads=32, model_dim=2048, num_key_value_heads=8\n",
" (q_proj): Linear(input_dim=2048, output_dim=2048, bias=False, init_fn=init_qkv_projection)\n",
" (k_proj): Linear(input_dim=2048, output_dim=512, bias=False, init_fn=init_qkv_projection)\n",
" (v_proj): Linear(input_dim=2048, output_dim=512, bias=False, init_fn=init_qkv_projection)\n",
" (pos_encoder): RotaryEncoder(encoding_dim=64, max_seq_len=4096)\n",
" (sdpa): TorchSDPA(attn_dropout_p=0.1)\n",
" (output_proj): Linear(input_dim=2048, output_dim=2048, bias=False, init_fn=init_output_projection)\n",
" )\n",
" (self_attn_norm): None\n",
" (self_attn_dropout): None\n",
" (self_attn_residual): StandardResidualConnect()\n",
" (encoder_decoder_attn): None\n",
" (encoder_decoder_attn_dropout): None\n",
" (encoder_decoder_attn_residual): None\n",
" (encoder_decoder_attn_layer_norm): None\n",
" (ffn_layer_norm): RMSNorm(normalized_shape=(2048,), eps=1E-05, elementwise_affine=True)\n",
" (ffn): GLUFeedForwardNetwork(\n",
" model_dim=2048, inner_dim_scale=0.666667, inner_dim_to_multiple=256\n",
" (gate_proj): Linear(input_dim=2048, output_dim=5632, bias=False)\n",
" (gate_activation): SiLU()\n",
" (inner_proj): Linear(input_dim=2048, output_dim=5632, bias=False)\n",
" (inner_dropout): Dropout(p=0.1, inplace=False)\n",
" (output_proj): Linear(input_dim=5632, output_dim=2048, bias=False)\n",
" )\n",
" (ffn_dropout): None\n",
" (ffn_residual): StandardResidualConnect()\n",
" )\n",
" )\n",
" (layer_norm): RMSNorm(normalized_shape=(2048,), eps=1E-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (final_proj): Linear(input_dim=2048, output_dim=32000, bias=False, init_fn=init_final_projection)\n",
")"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fairseq2.models.llama import LLaMAConfig, create_llama_model\n",
"from fairseq2.data import VocabularyInfo\n",
"\n",
"custom_config = LLaMAConfig(\n",
" model_dim=2048, # Model dimension\n",
" max_seq_len=4096, # Maximum sequence length\n",
" vocab_info=VocabularyInfo(\n",
" size=32000, # Vocabulary size\n",
" unk_idx=0, # Unknown index\n",
" bos_idx=1, # Beginning of sequence index\n",
" eos_idx=2, # End of sequence index\n",
" pad_idx=None # Padding index\n",
" ),\n",
" num_layers=16, # Number of transformer layers\n",
" num_attn_heads=32, # Number of attention heads\n",
" num_key_value_heads=8, # Number of key/value heads\n",
" ffn_inner_dim=2048 * 4, # FFN inner dimension\n",
" dropout_p=0.1 # Dropout probability\n",
")\n",
"\n",
"model = create_llama_model(custom_config)\n",
"model"
]
},
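{
"cell_type": "markdown",
"metadata": {},
"source": [
"Because the model above is a regular `nn.Module`, the usual PyTorch methods apply. The next cell is a minimal sketch that relies only on standard `torch.nn` functionality (parameter counting, dtype casting, eval mode) and on the `model` created above; nothing fairseq2-specific is assumed."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"\n",
"# count trainable parameters -- standard nn.Module functionality\n",
"num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)\n",
"print(f\"trainable parameters: {num_params:,}\")\n",
"\n",
"# cast and switch to eval mode, as with any nn.Module\n",
"model = model.to(dtype=torch.bfloat16)\n",
"model.eval()"
]
},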
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can also fetch some config presets from model hub."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"TransformerDecoderModel(\n",
" model_dim=4096\n",
" (decoder_frontend): TransformerEmbeddingFrontend(\n",
" model_dim=4096\n",
" (embed): StandardEmbedding(num_embeddings=128256, embedding_dim=4096)\n",
" (pos_encoder): None\n",
" (layer_norm): None\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (decoder): StandardTransformerDecoder(\n",
" model_dim=4096, self_attn_mask_factory=CausalAttentionMaskFactory(), norm_order=PRE\n",
" (layers): ModuleList(\n",
" (0-31): 32 x StandardTransformerDecoderLayer(\n",
" model_dim=4096, norm_order=PRE\n",
" (self_attn_layer_norm): RMSNorm(normalized_shape=(4096,), eps=1E-05, elementwise_affine=True)\n",
" (self_attn): StandardMultiheadAttention(\n",
" num_heads=32, model_dim=4096, num_key_value_heads=8\n",
" (q_proj): Linear(input_dim=4096, output_dim=4096, bias=False, init_fn=init_qkv_projection)\n",
" (k_proj): Linear(input_dim=4096, output_dim=1024, bias=False, init_fn=init_qkv_projection)\n",
" (v_proj): Linear(input_dim=4096, output_dim=1024, bias=False, init_fn=init_qkv_projection)\n",
" (pos_encoder): RotaryEncoder(encoding_dim=128, max_seq_len=131072)\n",
" (sdpa): TorchSDPA(attn_dropout_p=0.1)\n",
" (output_proj): Linear(input_dim=4096, output_dim=4096, bias=False, init_fn=init_output_projection)\n",
" )\n",
" (self_attn_norm): None\n",
" (self_attn_dropout): None\n",
" (self_attn_residual): StandardResidualConnect()\n",
" (encoder_decoder_attn): None\n",
" (encoder_decoder_attn_dropout): None\n",
" (encoder_decoder_attn_residual): None\n",
" (encoder_decoder_attn_layer_norm): None\n",
" (ffn_layer_norm): RMSNorm(normalized_shape=(4096,), eps=1E-05, elementwise_affine=True)\n",
" (ffn): GLUFeedForwardNetwork(\n",
" model_dim=4096, inner_dim_scale=0.666667, inner_dim_to_multiple=1024\n",
" (gate_proj): Linear(input_dim=4096, output_dim=14336, bias=False)\n",
" (gate_activation): SiLU()\n",
" (inner_proj): Linear(input_dim=4096, output_dim=14336, bias=False)\n",
" (inner_dropout): Dropout(p=0.1, inplace=False)\n",
" (output_proj): Linear(input_dim=14336, output_dim=4096, bias=False)\n",
" )\n",
" (ffn_dropout): None\n",
" (ffn_residual): StandardResidualConnect()\n",
" )\n",
" )\n",
" (layer_norm): RMSNorm(normalized_shape=(4096,), eps=1E-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (final_proj): Linear(input_dim=4096, output_dim=128256, bias=False, init_fn=init_final_projection)\n",
")"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fairseq2.models.llama import get_llama_model_hub, create_llama_model\n",
"\n",
"model_hub = get_llama_model_hub()\n",
"model_config = model_hub.load_config(\"llama3_1_8b_instruct\") # use llama3.1 8b preset as an example\n",
"\n",
"llama_model = create_llama_model(model_config)\n",
"llama_model"
]
},
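{
"cell_type": "markdown",
"metadata": {},
"source": [
"A preset config can be tweaked before the model is built. The sketch below assumes `LLaMAConfig` behaves like a regular dataclass (as its constructor call earlier suggests), so `dataclasses.replace` yields a modified copy; the fields used (`max_seq_len`, `dropout_p`) are the ones shown in the custom config above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from dataclasses import replace\n",
"\n",
"# sketch: derive a modified copy of the preset config before building the model\n",
"tweaked_config = replace(model_config, max_seq_len=8192, dropout_p=0.0)\n",
"\n",
"tweaked_model = create_llama_model(tweaked_config)\n",
"tweaked_config"
]
},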
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To check what are the registered models, we can leverage the `asset_store` in our runtime context."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"from fairseq2.context import get_runtime_context\n",
"context = get_runtime_context()\n",
"asset_store = context.asset_store"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['llama3_1_8b@',\n",
" 'llama3_1_8b_instruct@',\n",
" 'llama3_1_70b@',\n",
" 'llama3_1_70b_instruct@',\n",
" 'llama3_1_8b@awscluster',\n",
" 'llama3_1_8b@aws-h100-2',\n",
" 'llama3_1_8b_instruct@faircluster',\n",
" 'llama3_1_8b_instruct@awscluster',\n",
" 'llama3_1_8b_instruct@aws-h100-2',\n",
" 'llama3_1_70b@awscluster',\n",
" 'llama3_1_70b@aws-h100-2',\n",
" 'llama3_1_70b_instruct@faircluster',\n",
" 'llama3_1_70b_instruct@awscluster',\n",
" 'llama3_1_70b_instruct@aws-h100-2']"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"[asset for asset in asset_store.retrieve_names() if \"llama3_1\" in asset]"
]
},
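{
"cell_type": "markdown",
"metadata": {},
"source": [
"Names ending in a bare `@` are the base asset cards, while the suffix after `@` appears to mark cluster-specific overrides of the same card (an assumption based on the listing above). A small sketch that collapses the variants down to base names:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# sketch: collapse environment-specific variants (the part after \"@\") to base names\n",
"llama_assets = [name for name in asset_store.retrieve_names() if \"llama3_1\" in name]\n",
"sorted({name.split(\"@\")[0] for name in llama_assets})"
]
},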
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Loading pretrained model can also be done directly from the hub."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"TransformerDecoderModel(\n",
" model_dim=2048\n",
" (decoder_frontend): TransformerEmbeddingFrontend(\n",
" model_dim=2048\n",
" (embed): StandardEmbedding(num_embeddings=128256, embedding_dim=2048)\n",
" (pos_encoder): None\n",
" (layer_norm): None\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (decoder): StandardTransformerDecoder(\n",
" model_dim=2048, self_attn_mask_factory=CausalAttentionMaskFactory(), norm_order=PRE\n",
" (layers): ModuleList(\n",
" (0-15): 16 x StandardTransformerDecoderLayer(\n",
" model_dim=2048, norm_order=PRE\n",
" (self_attn_layer_norm): RMSNorm(normalized_shape=(2048,), eps=1E-05, elementwise_affine=True)\n",
" (self_attn): StandardMultiheadAttention(\n",
" num_heads=32, model_dim=2048, num_key_value_heads=8\n",
" (q_proj): Linear(input_dim=2048, output_dim=2048, bias=False, init_fn=init_qkv_projection)\n",
" (k_proj): Linear(input_dim=2048, output_dim=512, bias=False, init_fn=init_qkv_projection)\n",
" (v_proj): Linear(input_dim=2048, output_dim=512, bias=False, init_fn=init_qkv_projection)\n",
" (pos_encoder): RotaryEncoder(encoding_dim=64, max_seq_len=131072)\n",
" (sdpa): TorchSDPA(attn_dropout_p=0.1)\n",
" (output_proj): Linear(input_dim=2048, output_dim=2048, bias=False, init_fn=init_output_projection)\n",
" )\n",
" (self_attn_norm): None\n",
" (self_attn_dropout): None\n",
" (self_attn_residual): StandardResidualConnect()\n",
" (encoder_decoder_attn): None\n",
" (encoder_decoder_attn_dropout): None\n",
" (encoder_decoder_attn_residual): None\n",
" (encoder_decoder_attn_layer_norm): None\n",
" (ffn_layer_norm): RMSNorm(normalized_shape=(2048,), eps=1E-05, elementwise_affine=True)\n",
" (ffn): GLUFeedForwardNetwork(\n",
" model_dim=2048, inner_dim_scale=0.666667, inner_dim_to_multiple=256\n",
" (gate_proj): Linear(input_dim=2048, output_dim=8192, bias=False)\n",
" (gate_activation): SiLU()\n",
" (inner_proj): Linear(input_dim=2048, output_dim=8192, bias=False)\n",
" (inner_dropout): Dropout(p=0.1, inplace=False)\n",
" (output_proj): Linear(input_dim=8192, output_dim=2048, bias=False)\n",
" )\n",
" (ffn_dropout): None\n",
" (ffn_residual): StandardResidualConnect()\n",
" )\n",
" )\n",
" (layer_norm): RMSNorm(normalized_shape=(2048,), eps=1E-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (final_proj): Linear(input_dim=2048, output_dim=128256, bias=False, init_fn=init_final_projection)\n",
")"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fairseq2.models.llama import get_llama_model_hub\n",
"\n",
"model_hub = get_llama_model_hub()\n",
"# Load a pre-trained model from the hub\n",
"model = model_hub.load(\"llama3_2_1b\") # here llama3_2_1b needs to be a registered asset card\n",
"model"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
2 changes: 1 addition & 1 deletion doc/source/tutorials/preference_optimization.rst
@@ -209,7 +209,7 @@ fairseq2 supports four different preference optimization methods:
_set_:
version: v1
granularity: layer
hsdp: false
hybrid: false
reshard_after_forward: true
fp32_reduce: true
optimizer: