diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py
index ff95ab179398..95b9a346737c 100755
--- a/examples/pytorch/language-modeling/run_clm.py
+++ b/examples/pytorch/language-modeling/run_clm.py
@@ -681,12 +681,13 @@ def tokenize_function(examples):
             )
             block_size = min(1024, config.max_position_embeddings)
     else:
-        if data_args.block_size > tokenizer.model_max_length:
-            logger.warning(
-                f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model "
-                f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
-            )
-        block_size = min(data_args.block_size, tokenizer.model_max_length)
+        #if data_args.block_size > tokenizer.model_max_length:
+        #    logger.warning(
+        #        f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model "
+        #        f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
+        #    )
+        #block_size = min(data_args.block_size, tokenizer.model_max_length)
+        block_size = data_args.block_size

     # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
     def group_texts(examples):