From 4b07f15fb667116712a31fce06e0752c9bc982e3 Mon Sep 17 00:00:00 2001
From: Jon Bolin
Date: Tue, 19 Dec 2023 19:49:45 +0000
Subject: [PATCH] Remove block size limitation

---
 examples/pytorch/language-modeling/run_clm.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py
index ff95ab179398..95b9a346737c 100755
--- a/examples/pytorch/language-modeling/run_clm.py
+++ b/examples/pytorch/language-modeling/run_clm.py
@@ -681,12 +681,13 @@ def tokenize_function(examples):
             )
             block_size = min(1024, config.max_position_embeddings)
     else:
-        if data_args.block_size > tokenizer.model_max_length:
-            logger.warning(
-                f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model "
-                f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
-            )
-        block_size = min(data_args.block_size, tokenizer.model_max_length)
+        #if data_args.block_size > tokenizer.model_max_length:
+        #    logger.warning(
+        #        f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model "
+        #        f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
+        #    )
+        #block_size = min(data_args.block_size, tokenizer.model_max_length)
+        block_size = data_args.block_size

     # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
     def group_texts(examples):