@@ -8,10 +8,11 @@ use std::sync::atomic::{AtomicU64, Ordering};
8
8
use std:: sync:: Arc ;
9
9
10
10
use anyhow:: ensure;
11
- use bytemuck:: { cast_ref, try_from_bytes, Pod , Zeroable } ;
11
+ use bytemuck:: { bytes_of , cast_ref, try_from_bytes, Pod , Zeroable } ;
12
12
use bytes:: { BufMut , Bytes , BytesMut } ;
13
13
use crc:: Crc ;
14
14
use parking_lot:: Mutex ;
15
+ use rusqlite:: ffi:: SQLITE_ERROR ;
15
16
use uuid:: Uuid ;
16
17
17
18
use crate :: libsql:: ffi:: {
@@ -35,6 +36,14 @@ pub struct ReplicationLoggerHook {
35
36
logger : Arc < ReplicationLogger > ,
36
37
}
37
38
39
+ /// This implementation of WalHook intercepts calls to `on_frames`, and writes them to a
40
+ /// shadow wal. Writing to the shadow wal is done in three steps:
41
+ /// i. append the new pages at the offset pointed by header.start_frame_id + header.frame_count
42
+ /// ii. call the underlying implementation of on_frames
43
+ /// iii. if the call of the underlying method was successful, update the log header to the new
44
+ /// frame count.
45
+ ///
46
+ /// If either writing to the database or to the shadow wal fails, it must be a no-op.
38
47
unsafe impl WalHook for ReplicationLoggerHook {
39
48
fn on_frames (
40
49
& mut self ,
@@ -48,6 +57,22 @@ unsafe impl WalHook for ReplicationLoggerHook {
48
57
) -> c_int {
49
58
assert_eq ! ( page_size, 4096 ) ;
50
59
60
+ for ( page_no, data) in PageHdrIter :: new ( page_headers, page_size as _ ) {
61
+ self . write_frame ( page_no, data)
62
+ }
63
+
64
+ let commit_info = if is_commit != 0 {
65
+ match self . flush ( ntruncate) {
66
+ Err ( e) => {
67
+ tracing:: error!( "error writing to replication log: {e}" ) ;
68
+ return SQLITE_ERROR ;
69
+ }
70
+ Ok ( commit_info) => commit_info,
71
+ }
72
+ } else {
73
+ None
74
+ } ;
75
+
51
76
let rc = unsafe {
52
77
orig (
53
78
wal,
@@ -58,16 +83,11 @@ unsafe impl WalHook for ReplicationLoggerHook {
58
83
sync_flags,
59
84
)
60
85
} ;
61
- if rc != crate :: libsql:: ffi:: SQLITE_OK {
62
- return rc;
63
- }
64
86
65
- for ( page_no, data) in PageHdrIter :: new ( page_headers, page_size as _ ) {
66
- self . write_frame ( page_no, data)
67
- }
68
-
69
- if is_commit != 0 {
70
- self . commit ( ntruncate) ;
87
+ if is_commit != 0 && rc == 0 {
88
+ if let Some ( ( count, checksum) ) = commit_info {
89
+ self . commit ( count, checksum) ;
90
+ }
71
91
}
72
92
73
93
rc
@@ -138,10 +158,21 @@ impl ReplicationLoggerHook {
138
158
self . buffer . push ( entry) ;
139
159
}
140
160
141
- fn commit ( & mut self , size_after : u32 ) {
142
- self . buffer . last_mut ( ) . unwrap ( ) . size_after = size_after;
143
- self . logger . push_page ( & self . buffer ) ;
144
- self . buffer . clear ( ) ;
161
+ /// write buffered pages to the logger, without commiting.
162
+ /// Returns the attempted count and checksum, that need to be passed to `commit`
163
+ fn flush ( & mut self , size_after : u32 ) -> anyhow:: Result < Option < ( u64 , u64 ) > > {
164
+ if !self . buffer . is_empty ( ) {
165
+ self . buffer . last_mut ( ) . unwrap ( ) . size_after = size_after;
166
+ let ret = self . logger . write_pages ( & self . buffer ) ?;
167
+ self . buffer . clear ( ) ;
168
+ Ok ( Some ( ret) )
169
+ } else {
170
+ Ok ( None )
171
+ }
172
+ }
173
+
174
+ fn commit ( & self , new_count : u64 , new_checksum : u64 ) {
175
+ self . logger . commit ( new_count, new_checksum)
145
176
}
146
177
147
178
fn rollback ( & mut self ) {
@@ -161,10 +192,14 @@ struct LogFileHeader {
161
192
db_id : u128 ,
162
193
/// Frame index of the first frame in the log
163
194
start_frame_id : u64 ,
195
+ /// entry count in file
196
+ frame_count : u64 ,
164
197
/// Wal file version number, currently: 1
165
198
version : u32 ,
166
199
/// page size: 4096
167
200
page_size : i32 ,
201
+ /// 0 padding for alignment
202
+ _pad : u64 ,
168
203
}
169
204
170
205
impl LogFileHeader {
@@ -179,6 +214,18 @@ impl LogFileHeader {
179
214
fn encode < B : BufMut > ( & self , mut buf : B ) {
180
215
buf. put ( & cast_ref :: < _ , [ u8 ; size_of :: < Self > ( ) ] > ( self ) [ ..] ) ;
181
216
}
217
+
218
+ /// Returns the bytes position of the `nth` entry in the log
219
+ fn absolute_byte_offset ( nth : u64 ) -> u64 {
220
+ std:: mem:: size_of :: < Self > ( ) as u64 + nth * ReplicationLogger :: FRAME_SIZE as u64
221
+ }
222
+
223
+ fn byte_offset ( & self , id : FrameId ) -> Option < u64 > {
224
+ if id < self . start_frame_id || id > self . start_frame_id + self . frame_count {
225
+ return None ;
226
+ }
227
+ Self :: absolute_byte_offset ( id - self . start_frame_id ) . into ( )
228
+ }
182
229
}
183
230
184
231
/// The file header for the WAL log. All fields are represented in little-endian ordering.
@@ -222,14 +269,10 @@ impl Generation {
222
269
}
223
270
224
271
pub struct ReplicationLogger {
225
- /// offset id of the next Frame to write into the log
226
- next_frame_id : Mutex < FrameId > ,
227
- /// first index present in the file
228
- start_frame_id : FrameId ,
229
- log_file : File ,
272
+ log_header : Mutex < LogFileHeader > ,
230
273
current_checksum : AtomicU64 ,
231
- pub database_id : Uuid ,
232
274
pub generation : Generation ,
275
+ log_file : File ,
233
276
}
234
277
235
278
impl ReplicationLogger {
@@ -244,7 +287,6 @@ impl ReplicationLogger {
244
287
. read ( true )
245
288
. open ( path) ?;
246
289
let file_end = log_file. metadata ( ) ?. len ( ) ;
247
- let end_id;
248
290
let current_checksum;
249
291
250
292
let header = if file_end == 0 {
@@ -256,39 +298,43 @@ impl ReplicationLogger {
256
298
page_size : WAL_PAGE_SIZE ,
257
299
start_checksum : 0 ,
258
300
db_id : db_id. as_u128 ( ) ,
301
+ frame_count : 0 ,
302
+ _pad : 0 ,
259
303
} ;
260
304
261
305
let mut header_buf = BytesMut :: new ( ) ;
262
306
header. encode ( & mut header_buf) ;
307
+ current_checksum = AtomicU64 :: new ( 0 ) ;
263
308
264
309
assert_eq ! ( header_buf. len( ) , std:: mem:: size_of:: <LogFileHeader >( ) ) ;
265
310
266
311
log_file. write_all ( & header_buf) ?;
267
- end_id = 0 ;
268
- current_checksum = AtomicU64 :: new ( 0 ) ;
269
312
header
270
313
} else {
271
314
let mut header_buf = BytesMut :: zeroed ( size_of :: < LogFileHeader > ( ) ) ;
272
315
log_file. read_exact ( & mut header_buf) ?;
273
316
let header = LogFileHeader :: decode ( & header_buf) ?;
274
- end_id = ( file_end - size_of :: < FrameHeader > ( ) as u64 ) / Self :: FRAME_SIZE as u64 ;
275
317
current_checksum = AtomicU64 :: new ( Self :: compute_checksum ( & header, & log_file) ?) ;
276
318
header
277
319
} ;
278
320
279
321
Ok ( Self {
280
- next_frame_id : Mutex :: new ( end_id) ,
281
- start_frame_id : header. start_frame_id ,
282
- log_file,
283
322
current_checksum,
284
- database_id : Uuid :: from_u128 ( header. db_id ) ,
285
- generation : Generation :: new ( end_id) ,
323
+ generation : Generation :: new ( header. start_frame_id ) ,
324
+ log_header : Mutex :: new ( header) ,
325
+ log_file,
286
326
} )
287
327
}
288
328
289
- fn push_page ( & self , pages : & [ WalPage ] ) {
290
- let mut lock = self . next_frame_id . lock ( ) ;
291
- let mut current_offset = * lock;
329
+ pub fn database_id ( & self ) -> Uuid {
330
+ Uuid :: from_u128 ( self . log_header . lock ( ) . db_id )
331
+ }
332
+
333
+ /// Write pages to the log, without updating the file header.
334
+ /// Returns the new frame count and checksum to commit
335
+ fn write_pages ( & self , pages : & [ WalPage ] ) -> anyhow:: Result < ( u64 , u64 ) > {
336
+ let log_header = { * self . log_header . lock ( ) } ;
337
+ let mut current_frame = log_header. frame_count ;
292
338
let mut buffer = BytesMut :: with_capacity ( Self :: FRAME_SIZE ) ;
293
339
let mut current_checksum = self . current_checksum . load ( Ordering :: Relaxed ) ;
294
340
for page in pages. iter ( ) {
@@ -298,7 +344,7 @@ impl ReplicationLogger {
298
344
let checksum = digest. finalize ( ) ;
299
345
300
346
let header = FrameHeader {
301
- frame_id : current_offset ,
347
+ frame_id : log_header . start_frame_id + current_frame ,
302
348
checksum,
303
349
page_no : page. page_no ,
304
350
size_after : page. size_after ,
@@ -311,58 +357,45 @@ impl ReplicationLogger {
311
357
312
358
frame. encode ( & mut buffer) ;
313
359
314
- self . log_file
315
- . write_all_at (
316
- & buffer,
317
- self . byte_offset ( current_offset)
318
- . expect ( "attempt to write entry before first entry in the log" ) ,
319
- )
320
- . unwrap ( ) ;
360
+ let byte_offset = LogFileHeader :: absolute_byte_offset ( current_frame) ;
361
+ tracing:: trace!( "writing frame {current_frame} at offset {byte_offset}" ) ;
362
+ self . log_file . write_all_at ( & buffer, byte_offset) ?;
321
363
322
- current_offset += 1 ;
364
+ current_frame += 1 ;
323
365
current_checksum = checksum;
324
366
325
367
buffer. clear ( ) ;
326
368
}
327
369
328
- self . current_checksum
329
- . store ( current_checksum , Ordering :: Relaxed ) ;
330
-
331
- * lock = current_offset ;
370
+ Ok ( (
371
+ log_header . frame_count + pages . len ( ) as u64 ,
372
+ current_checksum ,
373
+ ) )
332
374
}
333
375
334
376
/// Returns bytes representing a WalFrame for frame `id`
335
377
///
336
378
/// If the requested frame is before the first frame in the log, or after the last frame,
337
379
/// Ok(None) is returned.
338
380
pub fn frame_bytes ( & self , id : FrameId ) -> anyhow:: Result < Option < Bytes > > {
339
- if id < self . start_frame_id {
381
+ let header = { * self . log_header . lock ( ) } ;
382
+ if id < header. start_frame_id {
340
383
return Ok ( None ) ;
341
384
}
342
385
343
- if id >= * self . next_frame_id . lock ( ) {
386
+ if id >= header . start_frame_id + header . frame_count {
344
387
return Ok ( None ) ;
345
388
}
346
389
347
390
let mut buffer = BytesMut :: zeroed ( Self :: FRAME_SIZE ) ;
348
391
self . log_file
349
- . read_exact_at ( & mut buffer, self . byte_offset ( id) . unwrap ( ) ) ?;
392
+ . read_exact_at ( & mut buffer, header. byte_offset ( id) . unwrap ( ) ) ?; // unwrap: we checked
393
+ // that the frame index
394
+ // is in the file before
350
395
351
396
Ok ( Some ( buffer. freeze ( ) ) )
352
397
}
353
398
354
- /// Returns the bytes position of the `nth` entry in the log
355
- fn absolute_byte_offset ( nth : u64 ) -> u64 {
356
- std:: mem:: size_of :: < LogFileHeader > ( ) as u64 + nth * ReplicationLogger :: FRAME_SIZE as u64
357
- }
358
-
359
- fn byte_offset ( & self , id : FrameId ) -> Option < u64 > {
360
- if id < self . start_frame_id {
361
- return None ;
362
- }
363
- Self :: absolute_byte_offset ( id - self . start_frame_id ) . into ( )
364
- }
365
-
366
399
/// Returns an iterator over the WAL frame headers
367
400
fn frames_iter (
368
401
file : & File ,
@@ -383,7 +416,7 @@ impl ReplicationLogger {
383
416
let mut current_offset = 0 ;
384
417
385
418
Ok ( std:: iter:: from_fn ( move || {
386
- let read_offset = Self :: absolute_byte_offset ( current_offset) ;
419
+ let read_offset = LogFileHeader :: absolute_byte_offset ( current_offset) ;
387
420
if read_offset >= file_len {
388
421
return None ;
389
422
}
@@ -407,6 +440,19 @@ impl ReplicationLogger {
407
440
Ok ( cs)
408
441
} )
409
442
}
443
+
444
+ fn commit ( & self , new_frame_count : u64 , new_current_checksum : u64 ) {
445
+ let mut header = { * self . log_header . lock ( ) } ;
446
+ header. frame_count = new_frame_count;
447
+
448
+ self . log_file
449
+ . write_all_at ( bytes_of ( & header) , 0 )
450
+ . expect ( "fatal error, failed to commit to log" ) ;
451
+
452
+ self . current_checksum
453
+ . store ( new_current_checksum, Ordering :: Relaxed ) ;
454
+ * self . log_header . lock ( ) = header;
455
+ }
410
456
}
411
457
412
458
#[ cfg( test) ]
@@ -418,24 +464,24 @@ mod test {
418
464
let dir = tempfile:: tempdir ( ) . unwrap ( ) ;
419
465
let logger = ReplicationLogger :: open ( dir. path ( ) ) . unwrap ( ) ;
420
466
421
- assert_eq ! ( * logger. next_frame_id. lock( ) , 0 ) ;
422
-
423
467
let frames = ( 0 ..10 )
424
468
. map ( |i| WalPage {
425
469
page_no : i,
426
470
size_after : 0 ,
427
471
data : Bytes :: from ( vec ! [ i as _; 4096 ] ) ,
428
472
} )
429
473
. collect :: < Vec < _ > > ( ) ;
430
- logger. push_page ( & frames) ;
474
+ let ( count, chk) = logger. write_pages ( & frames) . unwrap ( ) ;
475
+ logger. commit ( count, chk) ;
431
476
432
477
for i in 0 ..10 {
433
478
let frame = WalFrame :: decode ( logger. frame_bytes ( i) . unwrap ( ) . unwrap ( ) ) . unwrap ( ) ;
434
479
assert_eq ! ( frame. header. page_no, i as u32 ) ;
435
480
assert ! ( frame. data. iter( ) . all( |x| i as u8 == * x) ) ;
436
481
}
437
482
438
- assert_eq ! ( * logger. next_frame_id. lock( ) , 10 ) ;
483
+ let header = { * logger. log_header . lock ( ) } ;
484
+ assert_eq ! ( header. start_frame_id + header. frame_count, 10 ) ;
439
485
}
440
486
441
487
#[ test]
@@ -455,6 +501,8 @@ mod test {
455
501
size_after : 0 ,
456
502
data : vec ! [ 0 ; 3 ] . into ( ) ,
457
503
} ;
458
- logger. push_page ( & [ entry] ) ;
504
+
505
+ let ( count, chk) = logger. write_pages ( & [ entry] ) . unwrap ( ) ;
506
+ logger. commit ( count, chk) ;
459
507
}
460
508
}
0 commit comments