Comparing changes

base repository: instructlab/GPTDolomite
base: v0.0.1
head repository: instructlab/GPTDolomite
compare: main
  • 4 commits
  • 60 files changed
  • 3 contributors

Commits on Jul 9, 2024

  1. Update Llama config conversion to include architectures field (#5)

    Signed-off-by: Mustafa Eyceoz <[email protected]>
    Maxusmusti authored Jul 9, 2024

    Commit SHA: 541403f

Commits on Jul 10, 2024

  1. dynamic version (#6)

    Signed-off-by: Yu Chin Fabian Lim <[email protected]>
    fabianlim authored Jul 10, 2024

    Commit SHA: ddfc6b1

Commits on Jul 18, 2024

  1. rope positions need higher precision (#7)

    Signed-off-by: Yu Chin Fabian Lim <[email protected]>
    fabianlim authored Jul 18, 2024

    Commit SHA: da678a5

Commits on Nov 1, 2024

  1. Update for Recent Changes and Granite Model Class Support (#11)

    * Update for granite model class support
    * Add mixins
    * Removing rmsnorm options to avoid optional checks
    * Remove TP import
    * Add config init
    * Remove granite moe
    * Remove mixtral
    * Remove excess register stuff

    Signed-off-by: Mustafa Eyceoz <[email protected]>
    Maxusmusti authored Nov 1, 2024

    Commit SHA: 5fca4cc

Showing with 3,061 additions and 1,839 deletions.
  1. +6 −2 pyproject.toml
  2. +80 −0 src/instructlab/dolomite/enums.py
  3. +1 −1 src/instructlab/dolomite/hf_models/__init__.py
  4. +12 −31 src/instructlab/dolomite/hf_models/config.py
  5. +1 −0 src/instructlab/dolomite/hf_models/defaults.py
  6. +5 −4 src/instructlab/dolomite/hf_models/enums.py
  7. +4 −0 src/instructlab/dolomite/hf_models/mixins/__init__.py
  8. +2 −0 src/instructlab/dolomite/hf_models/mixins/dense/__init__.py
  9. +584 −0 src/instructlab/dolomite/hf_models/mixins/dense/base.py
  10. +198 −0 src/instructlab/dolomite/hf_models/mixins/dense/main.py
  11. +2 −0 src/instructlab/dolomite/hf_models/mixins/dense_TP/__init__.py
  12. +104 −0 src/instructlab/dolomite/hf_models/mixins/dense_TP/base.py
  13. +196 −0 src/instructlab/dolomite/hf_models/mixins/dense_TP/main.py
  14. +2 −0 src/instructlab/dolomite/hf_models/mixins/moe/__init__.py
  15. +205 −0 src/instructlab/dolomite/hf_models/mixins/moe/base.py
  16. +95 −0 src/instructlab/dolomite/hf_models/mixins/moe/main.py
  17. +2 −0 src/instructlab/dolomite/hf_models/mixins/moe_TP/__init__.py
  18. +75 −0 src/instructlab/dolomite/hf_models/mixins/moe_TP/base.py
  19. +89 −0 src/instructlab/dolomite/hf_models/mixins/moe_TP/main.py
  20. +7 −14 src/instructlab/dolomite/hf_models/model_conversion/__init__.py
  21. +14 −28 src/instructlab/dolomite/hf_models/model_conversion/bigcode.py
  22. +143 −0 src/instructlab/dolomite/hf_models/model_conversion/granite.py
  23. +277 −0 src/instructlab/dolomite/hf_models/model_conversion/granitemoe.py
  24. +90 −185 src/instructlab/dolomite/hf_models/model_conversion/llama.py
  25. +4 −7 src/instructlab/dolomite/hf_models/modeling_utils/__init__.py
  26. +2 −7 src/instructlab/dolomite/hf_models/modeling_utils/activations/__init__.py
  27. +2 −9 src/instructlab/dolomite/hf_models/modeling_utils/activations/base.py
  28. +6 −9 src/instructlab/dolomite/hf_models/modeling_utils/activations/glu.py
  29. +0 −37 src/instructlab/dolomite/hf_models/modeling_utils/activations/math_gelu.py
  30. +5 −14 src/instructlab/dolomite/hf_models/modeling_utils/attention/__init__.py
  31. +57 −85 src/instructlab/dolomite/hf_models/modeling_utils/attention/base.py
  32. +17 −93 src/instructlab/dolomite/hf_models/modeling_utils/attention/flash.py
  33. +12 −27 src/instructlab/dolomite/hf_models/modeling_utils/attention/padding_free.py
  34. +7 −14 src/instructlab/dolomite/hf_models/modeling_utils/attention/sdpa.py
  35. +7 −39 src/instructlab/dolomite/hf_models/modeling_utils/attention/utils.py
  36. +44 −0 src/instructlab/dolomite/hf_models/modeling_utils/embedding.py
  37. +50 −0 src/instructlab/dolomite/hf_models/modeling_utils/linear.py
  38. +7 −14 src/instructlab/dolomite/hf_models/modeling_utils/normalization/__init__.py
  39. +22 −0 src/instructlab/dolomite/hf_models/modeling_utils/normalization/layernorm/__init__.py
  40. +33 −0 src/instructlab/dolomite/hf_models/modeling_utils/normalization/layernorm/apex.py
  41. +64 −0 src/instructlab/dolomite/hf_models/modeling_utils/normalization/layernorm/apex_persistent.py
  42. +0 −81 src/instructlab/dolomite/hf_models/modeling_utils/normalization/norms.py
  43. +19 −0 src/instructlab/dolomite/hf_models/modeling_utils/normalization/rmsnorm/__init__.py
  44. +31 −0 src/instructlab/dolomite/hf_models/modeling_utils/normalization/rmsnorm/apex.py
  45. +13 −0 src/instructlab/dolomite/hf_models/modeling_utils/normalization/rmsnorm/base.py
  46. +205 −0 src/instructlab/dolomite/hf_models/modeling_utils/normalization/rmsnorm/torchtitan.py
  47. +1 −5 src/instructlab/dolomite/hf_models/modeling_utils/position_embedding/__init__.py
  48. +9 −56 src/instructlab/dolomite/hf_models/modeling_utils/position_embedding/alibi.py
  49. +95 −34 src/instructlab/dolomite/hf_models/modeling_utils/position_embedding/rope.py
  50. +1 −1 src/instructlab/dolomite/hf_models/models/__init__.py
  51. +1 −4 src/instructlab/dolomite/hf_models/models/gpt_dolomite/__init__.py
  52. +5 −732 src/instructlab/dolomite/hf_models/models/gpt_dolomite/base.py
  53. +5 −0 src/instructlab/dolomite/hf_models/models/gpt_dolomite/config.py
  54. +18 −33 src/instructlab/dolomite/hf_models/models/gpt_dolomite/layer.py
  55. +3 −183 src/instructlab/dolomite/hf_models/models/gpt_dolomite/main.py
  56. +23 −23 src/instructlab/dolomite/hf_models/models/gpt_dolomite/mlp.py
  57. +12 −13 src/instructlab/dolomite/hf_models/register_hf.py
  58. +58 −11 src/instructlab/dolomite/hf_models/utils.py
  59. +8 −16 src/instructlab/dolomite/utils/hf_hub.py
  60. +21 −27 src/instructlab/dolomite/utils/safetensors.py
8 changes: 6 additions & 2 deletions pyproject.toml
@@ -10,7 +10,6 @@ authors = [
{ name="InstructLab", email="dev@instructlab.ai" },
]
description = "Dolomite Engine"
version = "0.0.1.dev"
readme = "README.md"
license = {text = "Apache-2.0"}
requires-python = ">=3.10"
@@ -28,7 +27,7 @@ classifiers = [
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: Implementation :: CPython",
]
dynamic = ["dependencies"]
dynamic = ["dependencies", "version"]

[project.scripts]

@@ -37,6 +36,11 @@ homepage = "https://instructlab.ai"
source = "https://github.com/instructlab/GPTDolomite"
issues = "https://github.com/instructlab/GPTDolomite/issues"

[tool.setuptools_scm]
version_file = "src/instructlab/dolomite/_version.py"
# do not include +gREV local version, required for Test PyPI upload
local_scheme = "no-local-version"

[tool.setuptools]
package-dir = {"" = "src"}
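
The dynamic-version commit (#6) drops the hard-coded `version = "0.0.1.dev"`, declares `version` as dynamic, and lets setuptools_scm derive it from git metadata, writing the result to `src/instructlab/dolomite/_version.py` at build time. A minimal sketch of reading the resolved version at runtime; the distribution name "instructlab-dolomite" is an assumption, not taken from this diff:

from importlib.metadata import PackageNotFoundError, version

try:
    # Installed package: the version recorded by setuptools_scm at build time.
    print(version("instructlab-dolomite"))  # distribution name is assumed
except PackageNotFoundError:
    # Source tree: fall back to the generated version file.
    from instructlab.dolomite._version import __version__
    print(__version__)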

80 changes: 80 additions & 0 deletions src/instructlab/dolomite/enums.py
@@ -11,3 +11,83 @@ class ParamsGroupMethod(Enum):

class GradientCheckpointingMethod(Enum):
block = "block"


class LRDecaySchedule(str, Enum):
constant = "constant"
cosine = "cosine"
exponential = "exponential"
linear = "linear"
power = "power"


class AttentionImplementation(Enum):
"""
Enum class for attention implementation
"""

eager = "eager"
sdpa = "sdpa"
flash_attention_2 = "flash_attention_2"


class MoEImplementation(Enum):
"""
Enum class for MoE implementation
"""

eager = "eager"
scattermoe = "scattermoe"


class DatasetSplit(str, Enum):
"""dataset split"""

train = "train"
val = "val"
test = "test"


class Mode(str, Enum):
"""training / inference mode"""

training = "training"
inference = "inference"
unsharding = "unsharding"
distillation = "distillation"


class TuningMethod(str, Enum):
"""training method"""

pretraining = "pretraining"
full_finetuning = "full_finetuning"
prompt_tuning = "prompt_tuning"
lora = "lora"
distillation = "distillation"


class FP8Backend(str, Enum):
msamp = "msamp"
nvte = "nvte"


class LossMask(str, Enum):
"""Type of loss masking method"""

output_only = "output_only"
no_mask = "no_mask"


class KLDivergenceMethod(str, Enum):
"""Type of KL divergence"""

forward = "forward"
backward = "backward"


class ExperimentsTrackerName(str, Enum):
"""Experiment tracker to use"""

aim = "aim"
wandb = "wandb"
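
Most of the new enums subclass `str`, so they can be constructed directly from configuration strings and compared against raw values. A small usage sketch, illustrative only and not taken from the diff:

from instructlab.dolomite.enums import DatasetSplit, TuningMethod

split = DatasetSplit("train")        # construct from a plain config string
assert split is DatasetSplit.train
assert split == "train"              # str mixin: compares equal to the raw value

method = TuningMethod("full_finetuning")
if method in (TuningMethod.full_finetuning, TuningMethod.lora):
    print(f"tuning method: {method.value}")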
2 changes: 1 addition & 1 deletion src/instructlab/dolomite/hf_models/__init__.py
@@ -2,7 +2,7 @@
# Extracted from https://github.com/ibm-granite/dolomite-engine
# ----------------------------------------------------------------
# Local
from .config import GPTDolomiteConfig
from .models.gpt_dolomite.config import GPTDolomiteConfig
from .model_conversion import export_to_huggingface, import_from_huggingface
from .models import GPTDolomiteForCausalLM, GPTDolomiteModel
from .register_hf import register_model_classes
43 changes: 12 additions & 31 deletions src/instructlab/dolomite/hf_models/config.py
@@ -1,15 +1,9 @@
# ----------------------------------------------------------------
# Extracted from https://github.com/ibm-granite/dolomite-engine
# ----------------------------------------------------------------
# Third Party
from transformers import PretrainedConfig

# Local
from .enums import AttentionHeadType, PositionEmbeddingType
from .enums import AttentionHeadType, InitMethod, PositionEmbeddingType


class GPTDolomiteConfig(PretrainedConfig):
model_type = "gpt_dolomite"
class CommonConfig(PretrainedConfig):
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"hidden_size": "n_embd",
@@ -18,20 +12,15 @@ class GPTDolomiteConfig(PretrainedConfig):
"num_hidden_layers": "n_layer",
}

# NOTE: initializer range is kept for backward compatibility
# but it is not used anymore
# : also rope_scaling is not used anymore but kept for
# same reason.

def __init__(
self,
vocab_size: int = 50257,
n_positions: int = 1024,
n_embd: int = 768,
n_layer: int = 12,
n_head: int = 12,
num_key_value_heads: int = None,
n_inner: int = None,
num_key_value_heads: int | None = None,
n_inner: int | None = None,
activation_function: str = "gelu_pytorch_tanh",
attention_head_type: str = "mqa",
resid_pdrop: float = 0.1,
@@ -41,20 +30,19 @@ def __init__(
layer_norm_epsilon: float = 1e-5,
initializer_range: float = 0.02,
scale_attn_weights: bool = True,
attention_multiplier: float = None,
attention_multiplier: float | None = None,
use_cache: bool = True,
bos_token_id: int = 50256,
eos_token_id: int = 50256,
pad_token_id: int = 50256,
attention_softmax_in_fp32: bool = True,
scale_attention_softmax_in_fp32: bool = True,
add_bias: bool = True,
position_embedding_type: str = "learned_absolute",
rope_theta: int = 10000,
rope_scaling: dict = None,
m_emb: float = None,
m_width: float = None,
m_residual: float = None,
rope_scaling: dict | None = None,
m_emb: float | None = None,
m_width: float | None = None,
m_residual: float | None = None,
init_method: str = "normal",
upcast_logits_for_loss: bool = False,
**kwargs,
@@ -78,7 +66,6 @@ def __init__(
self.attention_multiplier = attention_multiplier
self.use_cache = use_cache
self.attention_softmax_in_fp32 = attention_softmax_in_fp32
self.scale_attention_softmax_in_fp32 = scale_attention_softmax_in_fp32
self.position_embedding_type = position_embedding_type
self.add_bias = add_bias
self.rope_theta = rope_theta
@@ -93,6 +80,7 @@ def __init__(
assert self.scale_attn_weights

# check if enums are valid
init_method = InitMethod(init_method)
attention_head_type = AttentionHeadType(attention_head_type)
position_embedding_type = PositionEmbeddingType(position_embedding_type)

@@ -110,9 +98,7 @@ def __init__(
if self.num_key_value_heads is None:
self.num_key_value_heads = 1

assert (
self.num_key_value_heads == 1
), "MultiQueryAttention should have 1 head for keys and values"
assert self.num_key_value_heads == 1, "MultiQueryAttention should have 1 head for keys and values"
elif attention_head_type == AttentionHeadType.gqa:
assert (
self.num_key_value_heads is not None
@@ -122,9 +108,4 @@ def __init__(
self.n_head % self.num_key_value_heads == 0
), "GroupedQueryAttention should have more than 1 head for keys and values"

super().__init__(
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
pad_token_id=pad_token_id,
**kwargs,
)
super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, pad_token_id=pad_token_id, **kwargs)
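
The reworked config keeps `GPTDolomiteConfig` importable from `hf_models` (see the `__init__.py` change above) while validating `init_method`, `attention_head_type`, and `position_embedding_type` through the enums and enforcing the head-count rules for each attention type. A hedged sketch of constructing a grouped-query-attention config; the specific values are arbitrary, not taken from the diff:

from instructlab.dolomite.hf_models import GPTDolomiteConfig

config = GPTDolomiteConfig(
    n_embd=768,
    n_layer=12,
    n_head=12,
    attention_head_type="gqa",   # validated against AttentionHeadType
    num_key_value_heads=4,       # must be set and divide n_head (12 % 4 == 0)
    init_method="normal",        # validated against InitMethod
)
print(config.hidden_size)        # attribute_map exposes n_embd as hidden_size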
1 change: 1 addition & 0 deletions src/instructlab/dolomite/hf_models/defaults.py
@@ -0,0 +1 @@
DEFAULT_NORMALIZATION_IMPLEMENTATION = "torch"
9 changes: 5 additions & 4 deletions src/instructlab/dolomite/hf_models/enums.py
@@ -1,10 +1,11 @@
# ----------------------------------------------------------------
# Extracted from https://github.com/ibm-granite/dolomite-engine
# ----------------------------------------------------------------
# Standard
from enum import Enum


class InitMethod(Enum):
normal = "normal"
mup = "mup"


class PositionEmbeddingType(Enum):
"""
Enum class for position embeddings
4 changes: 4 additions & 0 deletions src/instructlab/dolomite/hf_models/mixins/__init__.py
@@ -0,0 +1,4 @@
from .dense import BaseModelMixin, CausalLMModelMixin, PreTrainedModelMixin
#from .dense_TP import BaseModelMixin_TP, CausalLMModelMixin_TP, PreTrainedModelMixin_TP
from .moe import BaseMoEModelMixin, CausalLMMoEModelMixin, PreTrainedMoEModelMixin
#from .moe_TP import BaseMoEModelMixin_TP, CausalLMMoEModelMixin_TP, PreTrainedMoEModelMixin_TP
2 changes: 2 additions & 0 deletions src/instructlab/dolomite/hf_models/mixins/dense/__init__.py
@@ -0,0 +1,2 @@
from .base import BaseModelMixin, PreTrainedModelMixin
from .main import CausalLMModelMixin
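
The new mixins package factors the dense and MoE model plumbing into `PreTrainedModelMixin`, `BaseModelMixin`, and `CausalLMModelMixin` (tensor-parallel variants are present but commented out). A purely hypothetical sketch of how a model family might compose them; the actual GPTDolomite/Granite classes in this PR may wire things up differently:

from instructlab.dolomite.hf_models.mixins import (
    BaseModelMixin,
    CausalLMModelMixin,
    PreTrainedModelMixin,
)
from instructlab.dolomite.hf_models import GPTDolomiteConfig

class MyPreTrainedModel(PreTrainedModelMixin):
    config_class = GPTDolomiteConfig  # assumed hook; name and wiring are illustrative

class MyModel(MyPreTrainedModel, BaseModelMixin):
    pass  # backbone (embeddings + transformer blocks) comes from the mixin

class MyModelForCausalLM(MyPreTrainedModel, CausalLMModelMixin):
    base_model_class = MyModel  # assumed attribute linking the LM-head wrapper to its backbone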