diff --git a/mysql-test/suite/rpl/r/rpl_row_binlog_tmp_file_flush_enospc.result b/mysql-test/suite/rpl/r/rpl_row_binlog_tmp_file_flush_enospc.result new file mode 100644 index 0000000000000..91e4c7f8e31b7 --- /dev/null +++ b/mysql-test/suite/rpl/r/rpl_row_binlog_tmp_file_flush_enospc.result @@ -0,0 +1,39 @@ +include/master-slave.inc +[connection master] +connection master; +set @old_binlog_cache_size= @@global.binlog_cache_size; +set @@global.binlog_cache_size=4096; +CALL mtr.add_suppression('Error writing file'); +# +# Initialize test data +connection master; +create table t1 (a int, b longtext default NULL) engine=innodb; +# +# Create transaction with cache data larger than the binlog_cache_size +# so it spills into a tmp file, then simulate ENOSPC while flushing +# the tmp file. +# +set @@session.debug_dbug="+d,simulate_binlog_tmp_file_no_space_left_on_flush"; +insert into t1 values (2, repeat("y", 8192)); +ERROR HY000: Error writing file '.../tmp/' (Errcode: 28 "No space left on device") +set @@session.debug_dbug=""; +# +# Create another transaction to make sure the server/replication can +# continue working normally after the error +# +insert into t1 values (3, repeat("z", 8192)); +include/save_master_gtid.inc +connection slave; +include/sync_with_master_gtid.inc +include/diff_tables.inc [master:test.t1,slave:test.t1] +# +# Cleanup +connection master; +drop table t1; +include/save_master_gtid.inc +connection slave; +include/sync_with_master_gtid.inc +connection master; +set @@global.binlog_cache_size= @old_binlog_cache_size; +include/rpl_end.inc +# End of rpl_row_binlog_tmp_file_flush_enospc.test diff --git a/mysql-test/suite/rpl/t/rpl_row_binlog_tmp_file_flush_enospc.test b/mysql-test/suite/rpl/t/rpl_row_binlog_tmp_file_flush_enospc.test new file mode 100644 index 0000000000000..d9cc4fa6ef9f5 --- /dev/null +++ b/mysql-test/suite/rpl/t/rpl_row_binlog_tmp_file_flush_enospc.test @@ -0,0 +1,64 @@ +# +# This test ensures that a large transaction (i.e. one whose data exceeds the +# binlog_cache_size limit and spills into a tmp file) that errors while +# flushing the tmp file (e.g. due to no space on disk) does not corrupt the +# binlog, and that the server/replication can continue working normally after +# the error. To simulate the error, the test uses debug_dbug to inject a +# failure in the my_write() function, which is called when flushing the tmp +# file. +# +# References: +# MDEV-37662: Binlog Corruption When tmpdir is Full +# +--source include/have_debug.inc +--source include/have_innodb.inc +--source include/have_binlog_format_row.inc +--source include/master-slave.inc + +--connection master +set @old_binlog_cache_size= @@global.binlog_cache_size; +set @@global.binlog_cache_size=4096; +CALL mtr.add_suppression('Error writing file'); + +--echo # +--echo # Initialize test data +--connection master +create table t1 (a int, b longtext default NULL) engine=innodb; + +--echo # +--echo # Create transaction with cache data larger than the binlog_cache_size +--echo # so it spills into a tmp file, then simulate ENOSPC while flushing +--echo # the tmp file. +--echo # +set @@session.debug_dbug="+d,simulate_binlog_tmp_file_no_space_left_on_flush"; +--replace_regex /Error writing file '[^']+\/tmp\/[^']+'/Error writing file '...\/tmp\/'/ +--error 3 +insert into t1 values (2, repeat("y", 8192)); +set @@session.debug_dbug=""; + +--echo # +--echo # Create another transaction to make sure the server/replication can +--echo # continue working normally after the error +--echo # +insert into t1 values (3, repeat("z", 8192)); +--source include/save_master_gtid.inc +--connection slave +--source include/sync_with_master_gtid.inc + +--let $diff_tables=master:test.t1,slave:test.t1 +--source include/diff_tables.inc + +--echo # +--echo # Cleanup +--connection master +drop table t1; +--source include/save_master_gtid.inc + +--connection slave +--source include/sync_with_master_gtid.inc + +--connection master +set @@global.binlog_cache_size= @old_binlog_cache_size; + +--source include/rpl_end.inc +--echo # End of rpl_row_binlog_tmp_file_flush_enospc.test diff --git a/mysys/my_write.c b/mysys/my_write.c index 06914b2564108..68f568605fc8f 100644 --- a/mysys/my_write.c +++ b/mysys/my_write.c @@ -57,6 +57,7 @@ size_t my_write(File Filedes, const uchar *Buffer, size_t Count, myf MyFlags) if (!errors) { errno= ENOSPC; writtenbytes= (size_t) -1; + MyFlags&= ~MY_WAIT_IF_FULL; }); if (writtenbytes == Count) diff --git a/sql/log.cc b/sql/log.cc index 8339d7c5b2395..5f3118b8303d2 100644 --- a/sql/log.cc +++ b/sql/log.cc @@ -1899,6 +1899,31 @@ static int binlog_close_connection(handlerton *hton, THD *thd) DBUG_RETURN(0); } +/* + Ensures that the input IO Cache is consistent with where its data is stored, + i.e that the data is entirely either stored in-memory or backed by a + temporary file. In actuality, it is simple: if the IO Cache is actively + backed by a temporary file (i.e. the transaction or statement data is + sufficiently large to exceed its respective binlog_cache_size), then ensure + all data is flushed to the temporary file. Otherwise, the data is in-memory + by default, and we don't need to do anything. + + Returns TRUE on success, FALSE on error. +*/ +static my_bool binlog_cache_reconcile_data_in_storage(IO_CACHE *info) +{ + my_bool ret= 0; + DBUG_ENTER("binlog_flush_cache_log_to_disk"); + DBUG_ASSERT(info); + DBUG_ASSERT(!info->error); + DBUG_EXECUTE_IF("simulate_binlog_tmp_file_no_space_left_on_flush", + { DBUG_SET("+d,simulate_file_write_error"); }); + ret= info->pos_in_file && flush_io_cache(info); + DBUG_EXECUTE_IF("simulate_binlog_tmp_file_no_space_left_on_flush", + { DBUG_SET("-d,simulate_file_write_error"); }); + DBUG_RETURN(ret); +} + /* This function flushes a cache upon commit/rollback. @@ -1938,9 +1963,34 @@ binlog_flush_cache(THD *thd, binlog_cache_mngr *cache_mngr, (using_trx && !cache_mngr->trx_cache.empty()) || thd->transaction->xid_state.is_explicit_XA()) { - if (using_stmt && thd->binlog_flush_pending_rows_event(TRUE, FALSE)) + /* + thd->binlog_flush_pending_rows_event() ensures that the pending row event + is flushed into the respective IO cache. We also need to make sure that + the IO cache is consistent where its data is stored, i.e. it should + either be entirely in-memory or backed by a temporary file. So if + necessary (i.e if the cache data exceeds its binlog_cache_size), flush + the IO cache to its tmp file on disk. + + Technically, this reconciliation would happen automatically when writing + the cache data to the actual binlog file. We pre-empt it though because: + 1) we write the GTID event separately to the binlog directly before + moving the cache data, and if the reconciliation fails (e.g. if the + directory storing the tmp file is full), the binlog would get + corrupted with a standalone GTID event + 2) that happens during group commit with locks held, and other + ready-to-commit (concurrent) transactions could be stalled + */ + if (using_stmt && !thd->binlog_flush_pending_rows_event(TRUE, FALSE) && + binlog_cache_reconcile_data_in_storage( + cache_mngr->get_binlog_cache_log(FALSE))) DBUG_RETURN(1); - if (using_trx && thd->binlog_flush_pending_rows_event(TRUE, TRUE)) + + /* + See statment cache comment above. + */ + if (using_trx && !thd->binlog_flush_pending_rows_event(TRUE, TRUE) && + binlog_cache_reconcile_data_in_storage( + cache_mngr->get_binlog_cache_log(TRUE))) DBUG_RETURN(1); #ifdef WITH_WSREP @@ -7708,8 +7758,12 @@ int MYSQL_BIN_LOG::write_cache(THD *thd, IO_CACHE *cache) DBUG_ENTER("MYSQL_BIN_LOG::write_cache"); mysql_mutex_assert_owner(&LOCK_log); + DBUG_EXECUTE_IF("simulate_binlog_tmp_file_no_space_left_on_flush", + { DBUG_SET("+d,simulate_file_write_error"); }); if (reinit_io_cache(cache, READ_CACHE, 0, 0, 0)) DBUG_RETURN(ER_ERROR_ON_WRITE); + DBUG_EXECUTE_IF("simulate_binlog_tmp_file_no_space_left_on_flush", + { DBUG_SET("-d,simulate_file_write_error"); }); size_t length= my_b_bytes_in_cache(cache), group, carry, hdr_offs; size_t val; size_t end_log_pos_inc= 0; // each event processed adds BINLOG_CHECKSUM_LEN 2 t @@ -8983,12 +9037,27 @@ MYSQL_BIN_LOG::write_transaction_or_stmt(group_commit_entry *entry, } #endif /* WITH_WSREP */ - /* - An error in the trx_cache will truncate the cache to the last good - statement, it won't leave a lingering error. Assert that this holds. - */ - DBUG_ASSERT(!(entry->using_trx_cache && !mngr->trx_cache.empty() && - mngr->get_binlog_cache_log(TRUE)->error)); + +#ifndef DBUG_OFF + if (entry->using_trx_cache) + { + IO_CACHE *cache= mngr->get_binlog_cache_log(TRUE); + /* + An error in the trx_cache will truncate the cache to the last good + statement, it won't leave a lingering error. Assert that this holds. + */ + DBUG_ASSERT(mngr->trx_cache.empty() || + !cache->error); + + /* + If the transaction uses the IO cache temporary file (i.e if it is + sufficiently large), it should be fully flushed to disk by now. + */ + DBUG_ASSERT(!cache->pos_in_file || + cache->write_pos == cache->write_buffer); + } +#endif + /* An error in the stmt_cache would be caught on the higher level and result in an incident event being written over a (possibly corrupt) cache content.