Skip to content

Commit 72d0eef

Browse files
committed
MDEV-37662: Binlog Corruption When tmpdir is Full
The binary log could be corrupted when committing a large transaction (i.e. one whose data exceeds the binlog_cache_size limit and spills into a tmp file) in binlog_format=row if the server's --tmp-dir is full. The corruption that happens is only the GTID of the errored transaction would be written into the binary log, without any body/finalizing events. This would happen because the content of the transaction wasn't flushed at the proper time, and the transaction's binlog cache data was not durable while trying to copy the content from the binlog cache file into the binary log itself. While switching the tmp file from a WRITE_CACHE to a READ_CACHE, the server would see there is still data to flush in the cache, and first try to flush it. This is not a valid time to flush that data to the temporary file though, as: 1. The GTID event has already been written directly to the binary log. So if this flushing fails, it leaves the binary log in a corrupted state. 2. This is done during group commit, and will slow down other concurrent transactions, which are otherwise ready to commit. This patch fixes these issues by ensuring all transaction data is fully flushed to its temporary file (if used) before starting any critical paths, i.e. in binlog_flush_cache(). Note that if the binlog cache is solely in-memory, this flush-to-temporary-file is skipped. Reviewed-by: TODO
1 parent 6fc9abc commit 72d0eef

File tree

3 files changed

+114
-8
lines changed

3 files changed

+114
-8
lines changed
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
include/master-slave.inc
2+
[connection master]
3+
connection master;
4+
set @old_binlog_cache_size= @@global.binlog_cache_size;
5+
set @@global.binlog_cache_size=4096;
6+
CALL mtr.add_suppression('Error writing file');
7+
#
8+
# Initialize test data
9+
connection master;
10+
create table t1 (a int, b longtext default NULL) engine=innodb;
11+
#
12+
# Create transaction with cache data larger than the binlog_cache_size
13+
# so it spills into a tmp file, then simulate ENOSPC while flushing
14+
# the tmp file.
15+
#
16+
set @@session.debug_dbug="+d,simulate_binlog_tmp_file_no_space_left_on_flush";
17+
insert into t1 values (2, repeat("y", 8192));
18+
ERROR HY000: Error writing file '.../tmp/<FILENAME>' (Errcode: 28 "No space left on device")
19+
set @@session.debug_dbug="";
20+
#
21+
# Create another transaction to make sure the server/replication can
22+
# continue working normally after the error
23+
#
24+
insert into t1 values (3, repeat("z", 8192));
25+
include/save_master_gtid.inc
26+
connection slave;
27+
include/sync_with_master_gtid.inc
28+
include/diff_tables.inc [master:test.t1,slave:test.t1]
29+
#
30+
# Cleanup
31+
connection master;
32+
drop table t1;
33+
include/save_master_gtid.inc
34+
connection slave;
35+
include/sync_with_master_gtid.inc
36+
connection master;
37+
set @@global.binlog_cache_size= @old_binlog_cache_size;
38+
include/rpl_end.inc
39+
# End of rpl_row_binlog_tmp_file_flush_enospc.test

mysql-test/suite/rpl/t/rpl_row_binlog_tmp_file_flush_enospc.test

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
--connection master
1919
set @old_binlog_cache_size= @@global.binlog_cache_size;
2020
set @@global.binlog_cache_size=4096;
21+
CALL mtr.add_suppression('Error writing file');
2122

2223
--echo #
2324
--echo # Initialize test data
@@ -30,6 +31,7 @@ create table t1 (a int, b longtext default NULL) engine=innodb;
3031
--echo # the tmp file.
3132
--echo #
3233
set @@session.debug_dbug="+d,simulate_binlog_tmp_file_no_space_left_on_flush";
34+
--replace_regex /Error writing file '[^']+\/tmp\/[^']+'/Error writing file '...\/tmp\/<FILENAME>'/
3335
--error 3
3436
insert into t1 values (2, repeat("y", 8192));
3537
set @@session.debug_dbug="";

sql/log.cc

Lines changed: 73 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1899,6 +1899,31 @@ static int binlog_close_connection(handlerton *hton, THD *thd)
18991899
DBUG_RETURN(0);
19001900
}
19011901

1902+
/*
1903+
Ensures that the input IO Cache is consistent with where its data is stored,
1904+
i.e that the data is entirely either stored in-memory or backed by a
1905+
temporary file. In actuality, it is simple: if the IO Cache is actively
1906+
backed by a temporary file (i.e. the transaction or statement data is
1907+
sufficiently large to exceed its respective binlog_cache_size), then ensure
1908+
all data is flushed to the temporary file. Otherwise, the data is in-memory
1909+
by default, and we don't need to do anything.
1910+
1911+
Returns TRUE on success, FALSE on error.
1912+
*/
1913+
static my_bool binlog_cache_reconcile_data_in_storage(IO_CACHE *info)
1914+
{
1915+
my_bool ret= 0;
1916+
DBUG_ENTER("binlog_flush_cache_log_to_disk");
1917+
DBUG_ASSERT(info);
1918+
DBUG_ASSERT(!info->error);
1919+
DBUG_EXECUTE_IF("simulate_binlog_tmp_file_no_space_left_on_flush",
1920+
{ DBUG_SET("+d,simulate_file_write_error"); });
1921+
ret= info->pos_in_file && flush_io_cache(info);
1922+
DBUG_EXECUTE_IF("simulate_binlog_tmp_file_no_space_left_on_flush",
1923+
{ DBUG_SET("-d,simulate_file_write_error"); });
1924+
DBUG_RETURN(ret);
1925+
}
1926+
19021927
/*
19031928
This function flushes a cache upon commit/rollback.
19041929
@@ -1938,9 +1963,34 @@ binlog_flush_cache(THD *thd, binlog_cache_mngr *cache_mngr,
19381963
(using_trx && !cache_mngr->trx_cache.empty()) ||
19391964
thd->transaction->xid_state.is_explicit_XA())
19401965
{
1941-
if (using_stmt && thd->binlog_flush_pending_rows_event(TRUE, FALSE))
1966+
/*
1967+
thd->binlog_flush_pending_rows_event() ensures that the pending row event
1968+
is flushed into the respective IO cache. We also need to make sure that
1969+
the IO cache is consistent where its data is stored, i.e. it should
1970+
either be entirely in-memory or backed by a temporary file. So if
1971+
necessary (i.e if the cache data exceeds its binlog_cache_size), flush
1972+
the IO cache to its tmp file on disk.
1973+
1974+
Technically, this reconciliation would happen automatically when writing
1975+
the cache data to the actual binlog file. We pre-empt it though because:
1976+
1) we write the GTID event separately to the binlog directly before
1977+
moving the cache data, and if the reconciliation fails (e.g. if the
1978+
directory storing the tmp file is full), the binlog would get
1979+
corrupted with a standalone GTID event
1980+
2) that happens during group commit with locks held, and other
1981+
ready-to-commit (concurrent) transactions could be stalled
1982+
*/
1983+
if (using_stmt && !thd->binlog_flush_pending_rows_event(TRUE, FALSE) &&
1984+
binlog_cache_reconcile_data_in_storage(
1985+
cache_mngr->get_binlog_cache_log(FALSE)))
19421986
DBUG_RETURN(1);
1943-
if (using_trx && thd->binlog_flush_pending_rows_event(TRUE, TRUE))
1987+
1988+
/*
1989+
See statment cache comment above.
1990+
*/
1991+
if (using_trx && !thd->binlog_flush_pending_rows_event(TRUE, TRUE) &&
1992+
binlog_cache_reconcile_data_in_storage(
1993+
cache_mngr->get_binlog_cache_log(TRUE)))
19441994
DBUG_RETURN(1);
19451995

19461996
#ifdef WITH_WSREP
@@ -8987,12 +9037,27 @@ MYSQL_BIN_LOG::write_transaction_or_stmt(group_commit_entry *entry,
89879037
}
89889038
#endif /* WITH_WSREP */
89899039

8990-
/*
8991-
An error in the trx_cache will truncate the cache to the last good
8992-
statement, it won't leave a lingering error. Assert that this holds.
8993-
*/
8994-
DBUG_ASSERT(!(entry->using_trx_cache && !mngr->trx_cache.empty() &&
8995-
mngr->get_binlog_cache_log(TRUE)->error));
9040+
9041+
#ifndef DBUG_OFF
9042+
if (entry->using_trx_cache)
9043+
{
9044+
IO_CACHE *cache= mngr->get_binlog_cache_log(TRUE);
9045+
/*
9046+
An error in the trx_cache will truncate the cache to the last good
9047+
statement, it won't leave a lingering error. Assert that this holds.
9048+
*/
9049+
DBUG_ASSERT(mngr->trx_cache.empty() ||
9050+
!cache->error);
9051+
9052+
/*
9053+
If the transaction uses the IO cache temporary file (i.e if it is
9054+
sufficiently large), it should be fully flushed to disk by now.
9055+
*/
9056+
DBUG_ASSERT(!cache->pos_in_file ||
9057+
cache->write_pos == cache->write_buffer);
9058+
}
9059+
#endif
9060+
89969061
/*
89979062
An error in the stmt_cache would be caught on the higher level and result
89989063
in an incident event being written over a (possibly corrupt) cache content.

0 commit comments

Comments
 (0)