Add more choices to quantization tool. Post processing after sim_anneal(). (optimizer.py/ext_quant.cpp) #712

Status: Open. Wants to merge 40 commits into base: master. Changes shown are from all commits.

Commits (40):
d45c2b1 - Change malloc to calloc (Originalimoc, Jan 10, 2025)
a129ee9 - reverse VRAM scratching (Originalimoc, Jan 10, 2025)
782d26c - improvement v1 (Originalimoc, Jan 7, 2025)
056c60e - improvement v2 (Originalimoc, Jan 7, 2025)
272c31d - improvement v3-1 (Originalimoc, Jan 7, 2025)
117a60a - improvement v3-2 (Originalimoc, Jan 7, 2025)
8483153 - improvement v3-3 (Originalimoc, Jan 7, 2025)
21a4d9c - improvement v3-4 (Originalimoc, Jan 7, 2025)
8918b24 - improvement v3-5 (Originalimoc, Jan 7, 2025)
2282621 - improvement v3-5-1 (Originalimoc, Jan 8, 2025)
556d0e4 - improvement v3-5-2 (Originalimoc, Jan 9, 2025)
02cfba8 - improvement v3-5-3 (Originalimoc, Jan 9, 2025)
a0f0e90 - improvement v3-5-4 (Originalimoc, Jan 9, 2025)
305f312 - improvement v3-5-5 (Originalimoc, Jan 9, 2025)
bb120fc - improvement v3-5-6 (Originalimoc, Jan 10, 2025)
165c909 - improvement v3-5-f (Originalimoc, Jan 10, 2025)
baaa786 - update modes for v3-5-f (Originalimoc, Jan 13, 2025)
0ed200f - Merge branch 'turboderp-org:master' into master (Originalimoc, Feb 10, 2025)
a344009 - Add Qwen3ForCausalLM (turboderp, Apr 29, 2025)
b422a85 - Merge branch 'master' into dev (turboderp, Apr 29, 2025)
68976a0 - Add basic support for Qwen3MoE (turboderp, May 1, 2025)
747fbad - Merge branch 'master' into dev (turboderp, May 3, 2025)
e312b74 - Fix unload() for vision tower (turboderp, May 7, 2025)
c820539 - Actions: Add redirects to CUDA downloads (kingbri1, May 9, 2025)
aa2d5aa - Merge pull request #788 from turboderp-org/dev (kingbri1, May 12, 2025)
9d62150 - Project: Bump version (kingbri1, May 12, 2025)
0a77331 - Ext: Fix CUDA type cast (kingbri1, May 12, 2025)
bb4206d - Ext: Fix register call for float (kingbri1, May 12, 2025)
0a3d420 - Actions: Build rocm only (kingbri1, May 12, 2025)
a87ea02 - Remove SentencePiece support (turboderp, May 14, 2025)
1adff7d - Remove SentencePiece support (turboderp, May 14, 2025)
a811641 - Optimize paged cache defrag (turboderp, May 26, 2025)
97e4fd9 - Fail if tokenizer.json not found (turboderp, May 27, 2025)
a08ef4f - ExllamaV2: Bump version (kingbri1, May 27, 2025)
2b20c24 - Merge branch 'dev' (kingbri1, May 27, 2025)
b311d0a - Remove sentencepiece dep from setup.py (turboderp, May 28, 2025)
0efb999 - Merge remote-tracking branch 'origin/dev' into dev (turboderp, May 28, 2025)
2ca8281 - Merge branch 'dev' (turboderp, May 28, 2025)
99d9382 - Add more choices to quantization tool (Originalimoc, Jun 17, 2025)
c02e3e6 - Merge with 0.3.1 (Originalimoc, Jun 17, 2025)
108 changes: 54 additions & 54 deletions .github/workflows/build-wheels-release.yml

Large diffs are not rendered by default.

94 changes: 47 additions & 47 deletions .github/workflows/build-wheels-release_torch27_only.yml
@@ -68,58 +68,58 @@ jobs:
# Windows 2022 CUDA

# Python 3.10
- { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '11.8.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
- { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '12.1.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
- { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '11.8.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
- { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '12.1.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
- { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '11.8.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
- { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '12.1.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
- { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '11.8.0', rocm: '', torch: '2.6.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
- { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '12.4.0', rocm: '', torch: '2.6.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
- { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '12.8.1', rocm: '', torch: '2.7.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0 10.0 12.0+PTX' }

# Python 3.11
- { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '11.8.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
- { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '12.1.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
- { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '11.8.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
- { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '12.1.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
- { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '11.8.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
- { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '12.1.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
- { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '11.8.0', rocm: '', torch: '2.6.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
- { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '12.4.0', rocm: '', torch: '2.6.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
- { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '12.8.1', rocm: '', torch: '2.7.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0 10.0 12.0+PTX' }

# Python 3.12
- { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '11.8.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
- { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '12.1.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
- { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '11.8.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
- { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '12.1.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
- { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '11.8.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
- { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '12.1.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
- { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '11.8.0', rocm: '', torch: '2.6.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
- { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '12.4.0', rocm: '', torch: '2.6.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
- { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '12.8.1', rocm: '', torch: '2.7.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0 10.0 12.0+PTX' }

# Python 3.13
- { artname: 'wheel', os: windows-2022, pyver: '3.13', cuda: '11.8.0', rocm: '', torch: '2.6.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
- { artname: 'wheel', os: windows-2022, pyver: '3.13', cuda: '12.4.0', rocm: '', torch: '2.6.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
- { artname: 'wheel', os: windows-2022, pyver: '3.13', cuda: '12.8.1', rocm: '', torch: '2.7.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0 10.0 12.0+PTX' }
# - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '11.8.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
# - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '12.1.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
# - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '11.8.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
# - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '12.1.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
# - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '11.8.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
# - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '12.1.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
# - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '11.8.0', rocm: '', torch: '2.6.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
# - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '12.4.0', rocm: '', torch: '2.6.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
# - { artname: 'wheel', os: windows-2022, pyver: '3.10', cuda: '12.8.1', rocm: '', torch: '2.7.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0 10.0 12.0+PTX' }

# # Python 3.11
# - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '11.8.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
# - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '12.1.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
# - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '11.8.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
# - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '12.1.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
# - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '11.8.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
# - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '12.1.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
# - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '11.8.0', rocm: '', torch: '2.6.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
# - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '12.4.0', rocm: '', torch: '2.6.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
# - { artname: 'wheel', os: windows-2022, pyver: '3.11', cuda: '12.8.1', rocm: '', torch: '2.7.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0 10.0 12.0+PTX' }

# # Python 3.12
# - { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '11.8.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
# - { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '12.1.0', rocm: '', torch: '2.3.1', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
# - { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '11.8.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
# - { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '12.1.0', rocm: '', torch: '2.4.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
# - { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '11.8.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
# - { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '12.1.0', rocm: '', torch: '2.5.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
# - { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '11.8.0', rocm: '', torch: '2.6.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
# - { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '12.4.0', rocm: '', torch: '2.6.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
# - { artname: 'wheel', os: windows-2022, pyver: '3.12', cuda: '12.8.1', rocm: '', torch: '2.7.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0 10.0 12.0+PTX' }

# # Python 3.13
# - { artname: 'wheel', os: windows-2022, pyver: '3.13', cuda: '11.8.0', rocm: '', torch: '2.6.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
# - { artname: 'wheel', os: windows-2022, pyver: '3.13', cuda: '12.4.0', rocm: '', torch: '2.6.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
# - { artname: 'wheel', os: windows-2022, pyver: '3.13', cuda: '12.8.1', rocm: '', torch: '2.7.0', cudaarch: '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0 10.0 12.0+PTX' }

# # Ubuntu 20.04 ROCm

# # ROCm 5.6
# - { artname: 'wheel', os: ubuntu-22.04, pyver: '3.10', cuda: '', rocm: '5.6', torch: '2.2.2', cudaarch: '' }
# - { artname: 'wheel', os: ubuntu-22.04, pyver: '3.11', cuda: '', rocm: '5.6', torch: '2.2.2', cudaarch: '' }
# ROCm 5.6
- { artname: 'wheel', os: ubuntu-22.04, pyver: '3.10', cuda: '', rocm: '5.6', torch: '2.2.2', cudaarch: '' }
- { artname: 'wheel', os: ubuntu-22.04, pyver: '3.11', cuda: '', rocm: '5.6', torch: '2.2.2', cudaarch: '' }

# # ROCm 6.0
# - { artname: 'wheel', os: ubuntu-22.04, pyver: '3.10', cuda: '', rocm: '6.0', torch: '2.3.1', cudaarch: '' }
# - { artname: 'wheel', os: ubuntu-22.04, pyver: '3.11', cuda: '', rocm: '6.0', torch: '2.3.1', cudaarch: '' }
# - { artname: 'wheel', os: ubuntu-22.04, pyver: '3.12', cuda: '', rocm: '6.0', torch: '2.3.1', cudaarch: '' }
# ROCm 6.0
- { artname: 'wheel', os: ubuntu-22.04, pyver: '3.10', cuda: '', rocm: '6.0', torch: '2.3.1', cudaarch: '' }
- { artname: 'wheel', os: ubuntu-22.04, pyver: '3.11', cuda: '', rocm: '6.0', torch: '2.3.1', cudaarch: '' }
- { artname: 'wheel', os: ubuntu-22.04, pyver: '3.12', cuda: '', rocm: '6.0', torch: '2.3.1', cudaarch: '' }

# # ROCm 6.1
# - { artname: 'wheel', os: ubuntu-22.04, pyver: '3.10', cuda: '', rocm: '6.1', torch: '2.4.0', cudaarch: '' }
# - { artname: 'wheel', os: ubuntu-22.04, pyver: '3.11', cuda: '', rocm: '6.1', torch: '2.4.0', cudaarch: '' }
# - { artname: 'wheel', os: ubuntu-22.04, pyver: '3.12', cuda: '', rocm: '6.1', torch: '2.4.0', cudaarch: '' }
# ROCm 6.1
- { artname: 'wheel', os: ubuntu-22.04, pyver: '3.10', cuda: '', rocm: '6.1', torch: '2.4.0', cudaarch: '' }
- { artname: 'wheel', os: ubuntu-22.04, pyver: '3.11', cuda: '', rocm: '6.1', torch: '2.4.0', cudaarch: '' }
- { artname: 'wheel', os: ubuntu-22.04, pyver: '3.12', cuda: '', rocm: '6.1', torch: '2.4.0', cudaarch: '' }

# # sdist
# - { artname: 'sdist', os: ubuntu-22.04, pyver: '3.11', cuda: '', rocm: '', torch: '2.3.1', cudaarch: '' }
37 changes: 37 additions & 0 deletions exllamav2/architecture.py
@@ -53,6 +53,10 @@
["block_sparse_moe.experts.*.w2"],
["block_sparse_moe.experts.*.w3"],
["block_sparse_moe.gate"]]
layer_keys_qwen3moe_mlp = [["mlp.experts.*.gate_proj"],
["mlp.experts.*.up_proj"],
["mlp.experts.*.down_proj"],
["mlp.gate"]]
layer_keys_dbrx_mlp = [["block_sparse_moe.experts.*.v1", "block_sparse_moe.experts.v1"],
["block_sparse_moe.experts.*.w1", "block_sparse_moe.experts.w1"],
["block_sparse_moe.experts.*.w2", "block_sparse_moe.experts.w2"],
@@ -428,6 +432,39 @@ class Params:
self.lm.attention_bias_qkv = True
self.lm.supports_tp = True

# Qwen3

if arch_string == "Qwen3ForCausalLM":
arch_recognized = True
self.lm.layer_keys += \
layer_keys_llama_norms + \
layer_keys_llama_attn + \
layer_keys_llama_mlp
self.lm.expect_keys += \
expect_keys_llama
self.lm.supports_tp = True
self.lm.default_use_qk_norm = True

# Qwen3MoE

if arch_string == "Qwen3MoeForCausalLM":
arch_recognized = True
self.lm.layer_keys += \
layer_keys_llama_norms + \
layer_keys_llama_attn + \
layer_keys_qwen3moe_mlp
self.lm.expect_keys += \
expect_keys_llama
self.lm.supports_tp = True
self.lm.default_use_qk_norm = True
self.lm.keys.update({
"mlp_gate": ".mlp.experts.*.gate_proj",
"mlp_up": ".mlp.experts.*.up_proj",
"mlp_down": ".mlp.experts.*.down_proj",
"mlp_expert_gate": ".mlp.gate"
})
self.lm.is_moe = True

# Qwen2-VL (2, 2.5)

if arch_string in ["Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration"]:
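
A note on the MoE layer keys added above: the "*" wildcard stands for the expert index, so each pattern covers one tensor per expert in the checkpoint. As a purely hypothetical illustration (this helper is not part of exllamav2), the expansion implied by those keys looks roughly like this:

```python
# Hypothetical illustration only; not the actual exllamav2 loader code.
# Shows what the "*" wildcard in the Qwen3MoE layer keys stands for:
# one tensor per expert index.
def expand_expert_keys(pattern: str, num_experts: int) -> list[str]:
    """Expand e.g. "mlp.experts.*.gate_proj" into per-expert tensor names."""
    return [pattern.replace("*", str(i)) for i in range(num_experts)]

print(expand_expert_keys("mlp.experts.*.gate_proj", 4))
# ['mlp.experts.0.gate_proj', 'mlp.experts.1.gate_proj',
#  'mlp.experts.2.gate_proj', 'mlp.experts.3.gate_proj']
```
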
5 changes: 4 additions & 1 deletion exllamav2/config.py
@@ -319,9 +319,12 @@ def prepare(self, no_tensors: bool = False):
default_intermediate_size,
opt_subkey = "text_config",
)
self.num_experts = read(read_config, int, ["num_local_experts", "ffn_config->moe_num_experts"], None)
self.num_experts = read(read_config, int, ["num_local_experts", "ffn_config->moe_num_experts", "num_experts"], None)
self.num_experts_per_token = read(read_config, int,["num_experts_per_tok", "ffn_config->moe_top_k"], None)

if self.arch.lm.is_moe:
self.intermediate_size = read(read_config, int, ["moe_intermediate_size"], self.intermediate_size)

# Logit/embedding/residual scale

self.logit_scale = read(read_config, float, "logit_scale", 1)
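
The config.py change adds "num_experts" to the list of keys tried for the expert count (the key a Qwen3MoE config.json uses) and, for MoE architectures, prefers "moe_intermediate_size" over the dense intermediate size. The read() helper tries the keys in order, with "->" apparently denoting a nested field. A hedged sketch of that fallback pattern, assuming that behavior (the real helper may differ in type checking and error handling):

```python
# Sketch of the multi-key fallback the diff relies on; illustrative only.
def read_first(config: dict, keys: list[str], default=None):
    for key in keys:
        node = config
        for part in key.split("->"):  # "ffn_config->moe_num_experts" = nested dict
            if isinstance(node, dict) and part in node:
                node = node[part]
            else:
                break
        else:
            return node  # every part of the key was found
    return default

# A Qwen3MoE-style config.json carries "num_experts" and "moe_intermediate_size".
cfg = {"num_experts": 128, "moe_intermediate_size": 768, "intermediate_size": 6144}
assert read_first(cfg, ["num_local_experts", "ffn_config->moe_num_experts", "num_experts"]) == 128
assert read_first(cfg, ["moe_intermediate_size"], cfg["intermediate_size"]) == 768
```
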
5 changes: 4 additions & 1 deletion exllamav2/conversion/adaptivegptq.py
@@ -229,7 +229,10 @@ def prepare(self, no_h_inv = False):

with torch.inference_mode():

self.hessian /= self.num_batches
if self.hessian is None or self.num_batches == 0:
self.hessian = torch.eye(self.rows, device = self.device, dtype = torch.float)
else:
self.hessian /= self.num_batches
diagonal = torch.diag(self.hessian)

# Prepare weights
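
The adaptivegptq.py guard covers the case where no calibration batches contributed to a layer's Hessian: instead of dividing a None Hessian (or dividing by zero batches), it falls back to an identity matrix, which weights every input dimension equally. A minimal standalone sketch of the same idea (the function name and signature are illustrative, not the actual class method):

```python
import torch

# Illustrative standalone version of the fallback added above; the real code
# is a method on the quantizer class and works on its own attributes.
def finalize_hessian(hessian, num_batches: int, rows: int, device: str = "cpu") -> torch.Tensor:
    if hessian is None or num_batches == 0:
        # No calibration data reached this module: use the identity so the
        # error weighting is uniform rather than failing on None / divide-by-zero.
        return torch.eye(rows, device=device, dtype=torch.float)
    return hessian / num_batches
```
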
10 changes: 5 additions & 5 deletions exllamav2/exllamav2_ext/cpp/sampling.cpp
@@ -38,7 +38,7 @@ void apply_rep_penalty_cpu
// {
// if (g_rep_mask) free(g_rep_mask);
// g_vocab_size = vocab_size;
// g_rep_mask = (bool*) malloc(g_vocab_size * sizeof(bool));
// g_rep_mask = (bool*) calloc(1, g_vocab_size * sizeof(bool));
// }
// memset(g_rep_mask, 0, g_vocab_size * sizeof(bool));
bool* g_rep_mask = (bool*) calloc(vocab_size, sizeof(bool));
@@ -655,7 +655,7 @@ int tfs_cpu

int nc = sort_descending(num_candidates, temp_probs, temp_indices, num_candidates);

float* derivative = (float*) malloc(nc * sizeof(float));
float* derivative = (float*) calloc(1, nc * sizeof(float));
float dsum = 0.0f;
for (int i = 0; i < nc - 2; i++)
{
@@ -759,9 +759,9 @@ int typical_cpu

int r_candidates = pre_sort_descending(num_candidates, temp_probs, temp_indices);

float* temp = (float*) malloc(r_candidates * sizeof(float));
int* entropy_dev_order = (int*) malloc(r_candidates * sizeof(int));
int* temp_indices_2 = (int*) malloc(r_candidates * sizeof(int));
float* temp = (float*) calloc(1, r_candidates * sizeof(float));
int* entropy_dev_order = (int*) calloc(1, r_candidates * sizeof(int));
int* temp_indices_2 = (int*) calloc(1, r_candidates * sizeof(int));

float neg_entropy = 0.0f;
for (int i = 0; i < r_candidates; i++)