Implementations of visual tokenizers in PyTorch.
Vector Quantization Methods:
- Vector Quantization (VQVAE, VQGAN) (see the sketch after this list)
- Residual Quantization (RQVAE)
- Finite Scalar Quantization (FSQ)
- Lookup-Free Quantization (LFQ)
- Binary Spherical Quantization (BSQ)
- SimVQ
- Index Backpropagation Quantization (IBQ)
- Grouped Spherical Quantization (GSQ)
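As a point of reference for these methods, below is a minimal sketch of the plain vector quantization step used in VQVAE/VQGAN: nearest-codebook lookup, the codebook and commitment losses, and a straight-through estimator for gradients. It is a simplified illustration (the class name and defaults are made up here), not the exact module in this repo; the other methods above swap out the learned codebook (residual stages in RQ, fixed scalar levels in FSQ, binary or spherical codes in LFQ/BSQ) while keeping the same encode-quantize-decode structure.

import torch
import torch.nn as nn
import torch.nn.functional as F

class SimpleVectorQuantizer(nn.Module):
    """Minimal VQVAE-style quantizer (illustrative sketch, not the repo's module)."""
    def __init__(self, codebook_size=1024, codebook_dim=256, beta=0.25):
        super().__init__()
        self.beta = beta  # commitment loss weight
        self.codebook = nn.Embedding(codebook_size, codebook_dim)
        self.codebook.weight.data.uniform_(-1.0 / codebook_size, 1.0 / codebook_size)

    def forward(self, z):
        # z: (B, C, H, W) continuous encoder output
        B, C, H, W = z.shape
        z_flat = z.permute(0, 2, 3, 1).reshape(-1, C)          # (B*H*W, C)
        # Nearest codebook entry under squared L2 distance.
        dist = torch.cdist(z_flat, self.codebook.weight)       # (B*H*W, K)
        indices = dist.argmin(dim=1)                           # (B*H*W,)
        z_q = self.codebook(indices).view(B, H, W, C).permute(0, 3, 1, 2)
        # Codebook loss + beta-weighted commitment loss.
        loss = F.mse_loss(z_q, z.detach()) + self.beta * F.mse_loss(z, z_q.detach())
        # Straight-through estimator: gradients flow from z_q back to z.
        z_q = z + (z_q - z).detach()
        return z_q, indices.view(B, H, W), loss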
ImageNet 256x256 Reproduction:
- VQGAN (Taming-Transformers)
- VQGAN (LlamaGen)
- ViT-VQGAN
- VQGAN+ (from MaskBit)
- TiTok
The code is tested with Python 3.11, PyTorch 2.1.1, and CUDA 12.1.
Clone this repo:
git clone https://github.com/xyfJASON/visual-tokenizer-pytorch.git
cd visual-tokenizer-pytorch
Create and activate a conda environment:
conda create -n vistok python=3.11
conda activate vistok
Install dependencies:
pip install torch==2.1.1 torchvision==0.16.1 --index-url https://download.pytorch.org/whl/cu121
pip install -r requirements.txt
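A quick way to confirm the environment matches the tested versions (a generic PyTorch check, not a script from this repo):
python -c "import torch; print(torch.__version__, torch.version.cuda, torch.cuda.is_available())"
On the tested setup this should print something like: 2.1.1+cu121 12.1 True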
See benchmarks for more details.
Train a VQVAE (single process):
torchrun --nproc-per-node 1 train_vqvae.py -c CONFIG -e EXP_DIR
Train a VQGAN (8 processes):
torchrun --nproc-per-node 8 train_vqgan.py -c CONFIG -e EXP_DIR
Evaluate a trained tokenizer:
torchrun --nproc-per-node 8 evaluate.py \
-c CONFIG \
--weights WEIGHTS \
[--save_dir SAVE_DIR] \
[--bspp BATCH_SIZE_PER_PROC] \
[--seed SEED]
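For example, a full evaluation call might look like the following (the config and weights paths are placeholders for illustration; substitute the files from your own training run):
torchrun --nproc-per-node 8 evaluate.py \
-c path/to/config.yaml \
--weights path/to/checkpoint.pt \
--bspp 64 \
--seed 0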
VQVAE:
@article{van2017neural,
title={Neural discrete representation learning},
author={Van Den Oord, Aaron and Vinyals, Oriol and others},
journal={Advances in neural information processing systems},
volume={30},
year={2017}
}
VQGAN (Taming Transformers):
@inproceedings{esser2021taming,
title={Taming transformers for high-resolution image synthesis},
author={Esser, Patrick and Rombach, Robin and Ommer, Bj{\"o}rn},
booktitle={Proceedings of the IEEE/CVF conference on computer vision and pattern recognition},
pages={12873--12883},
year={2021}
}
ViT-VQGAN:
@inproceedings{yu2022vectorquantized,
title={Vector-quantized Image Modeling with Improved {VQGAN}},
author={Jiahui Yu and Xin Li and Jing Yu Koh and Han Zhang and Ruoming Pang and James Qin and Alexander Ku and Yuanzhong Xu and Jason Baldridge and Yonghui Wu},
booktitle={International Conference on Learning Representations},
year={2022},
url={https://openreview.net/forum?id=pfNyExj7z2}
}
MaskGIT:
@inproceedings{chang2022maskgit,
title={Maskgit: Masked generative image transformer},
author={Chang, Huiwen and Zhang, Han and Jiang, Lu and Liu, Ce and Freeman, William T},
booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
pages={11315--11325},
year={2022}
}
VQGAN (LlamaGen):
@article{sun2024autoregressive,
title={Autoregressive Model Beats Diffusion: Llama for Scalable Image Generation},
author={Sun, Peize and Jiang, Yi and Chen, Shoufa and Zhang, Shilong and Peng, Bingyue and Luo, Ping and Yuan, Zehuan},
journal={arXiv preprint arXiv:2406.06525},
year={2024}
}
FSQ:
@inproceedings{mentzer2024finite,
title={Finite Scalar Quantization: {VQ}-{VAE} Made Simple},
author={Fabian Mentzer and David Minnen and Eirikur Agustsson and Michael Tschannen},
booktitle={The Twelfth International Conference on Learning Representations},
year={2024},
url={https://openreview.net/forum?id=8ishA3LxN8}
}
LFQ (MAGVIT-v2):
@inproceedings{yu2024language,
title={Language Model Beats Diffusion - Tokenizer is key to visual generation},
author={Lijun Yu and Jose Lezama and Nitesh Bharadwaj Gundavarapu and Luca Versari and Kihyuk Sohn and David Minnen and Yong Cheng and Agrim Gupta and Xiuye Gu and Alexander G Hauptmann and Boqing Gong and Ming-Hsuan Yang and Irfan Essa and David A Ross and Lu Jiang},
booktitle={The Twelfth International Conference on Learning Representations},
year={2024},
url={https://openreview.net/forum?id=gzqrANCF4g}
}
BSQ:
@article{zhao2024image,
title={Image and Video Tokenization with Binary Spherical Quantization},
author={Zhao, Yue and Xiong, Yuanjun and Kr{\"a}henb{\"u}hl, Philipp},
journal={arXiv preprint arXiv:2406.07548},
year={2024}
}
MaskBit:
@article{weber2024maskbit,
title={Maskbit: Embedding-free image generation via bit tokens},
author={Weber, Mark and Yu, Lijun and Yu, Qihang and Deng, Xueqing and Shen, Xiaohui and Cremers, Daniel and Chen, Liang-Chieh},
journal={arXiv preprint arXiv:2409.16211},
year={2024}
}
SimVQ:
@article{zhu2024addressing,
title={Addressing Representation Collapse in Vector Quantized Models with One Linear Layer},
author={Zhu, Yongxin and Li, Bocheng and Xin, Yifei and Xu, Linli},
journal={arXiv preprint arXiv:2411.02038},
year={2024}
}
IBQ:
@article{shi2024taming,
title={Taming Scalable Visual Tokenizer for Autoregressive Image Generation},
author={Shi, Fengyuan and Luo, Zhuoyan and Ge, Yixiao and Yang, Yujiu and Shan, Ying and Wang, Limin},
journal={arXiv preprint arXiv:2412.02692},
year={2024}
}
GSQ:
@article{wang2024scaling,
title={Scaling Image Tokenizers with Grouped Spherical Quantization},
author={Wang, Jiangtao and Qin, Zhen and Zhang, Yifan and Hu, Vincent Tao and Ommer, Bj{\"o}rn and Briq, Rania and Kesselheim, Stefan},
journal={arXiv preprint arXiv:2412.02632},
year={2024}
}