Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 26 additions & 4 deletions include/compio/file.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -185,17 +185,29 @@ struct index_node : public infile_object {
*
*/
struct storage_block : public infile_object {
static constexpr uint8_t signature = 171; // FNV-1a
static constexpr uint8_t signature_crc32c = 172; // CRC32C
static constexpr uint8_t signature = 171; // FNV-1a (v1, no back-ref)
static constexpr uint8_t signature_crc32c = 172; // CRC32C (v1, no back-ref)
// v2 self-describing blocks: carry their owning {hash,pos} key, enabling
// index reconstruction from blocks alone after partial/total index loss.
static constexpr uint8_t signature_backref = 173; // FNV-1a (v2, back-ref)
static constexpr uint8_t signature_backref_crc32c = 174; // CRC32C (v2, back-ref)

uint8_t is_compressed; /**< Is this block compressed */
uint64_t size; /**< Size of data array */
uint64_t original_size; /**< Original size (size of uncompressed data) */
std::unique_ptr<uint8_t[]> data; /**< Data block */
uint32_t checksum; /**< Checksum (4 bytes) */

compio_checksum_type checksum_type; /**< Algorithm used for checksum */

/** @brief Owning B-tree key (v2 only). Written into the block, read back on
* recovery. has_backref tells whether src_key is present on disk. */
tree_key src_key{};
bool has_backref = false;

/** @brief On-disk metadata size implied by a block signature byte */
static uint64_t meta_size_for(uint8_t sig);

storage_block();

storage_block(std::unique_ptr<uint8_t[]> &&data, uint64_t size);
Expand Down Expand Up @@ -224,16 +236,26 @@ struct storage_block : public infile_object {
* @return true if checksum matches, false otherwise
*/
bool verify_checksum() const;

/** @brief Compute checksum over (back-ref key if v2) + data */
uint32_t compute_checksum() const;
};

/**
* @brief Size of v1 storage block metadata (without data, no back-ref)
*
* Sum of the fixed on-disk header fields: signature byte, is_compressed flag,
* size, original_size and the 32-bit checksum (1 + 1 + 8 + 8 + 4 = 22 bytes).
*/
#define STORAGE_BLOCK_METASIZE \
(sizeof(uint8_t) /* signature */ + sizeof(storage_block::is_compressed) + \
sizeof(storage_block::size) + sizeof(storage_block::original_size) + \
sizeof(uint32_t) /* 32-bit checksum (FNV-1a or CRC32C) */)

/**
* @brief Size of v2 storage block metadata (back-ref {hash,pos} added).
* New blocks are always written in v2 layout.
*
* NOTE(review): this uses sizeof(tree_key) as the on-disk size of the
* back-ref. That presumably equals 16 (two uint64_t fields, no padding) —
* TODO confirm, e.g. with a static_assert next to tree_key.
*/
#define STORAGE_BLOCK_METASIZE_V2 \
(STORAGE_BLOCK_METASIZE + sizeof(tree_key) /* hash + pos */)

} // namespace compio

#endif // COMPIO_FILE_H_
10 changes: 10 additions & 0 deletions include/compio/storage_block_reader.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,9 @@ class block {
bool _is_modified;
bool _is_removed;
bool _is_valid;
/** @brief On-disk metadata size of the block as read (v1=22, v2=38), so
* deallocation frees the exact on-disk footprint. Defaults to the v2 size,
* the layout used for newly written blocks. */
uint64_t _disk_meta_size = STORAGE_BLOCK_METASIZE_V2;

public:
/**
Expand Down Expand Up @@ -259,6 +262,13 @@ class block {
* @return Size of the block in bytes when compressed (0 if not yet compressed)
*/
uint64_t c_size() const;

/**
* @brief Get the on-disk metadata size of this block (v1=22, v2=38)
*
* @return Byte size of the block's on-disk metadata header
*/
uint64_t disk_meta_size() const;
};

/**
Expand Down
16 changes: 14 additions & 2 deletions src/allocator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1031,9 +1031,16 @@ void block_allocator::perform_defragmentation() {
continue;
}

// Read actual compressed size from on-disk metadata (signature + is_compressed + size).
// Read signature + compressed size from on-disk metadata. The signature
// determines the metadata footprint (v1=22, v2=38 with {hash,pos} back-ref).
uint64_t compressed_size = 0;
uint8_t sig = 0;
{
if (fseek64(archive_->file, static_cast<int64_t>(src), SEEK_SET) != 0 ||
fread(&sig, 1, 1, archive_->file) != 1) {
WARNING_PRINT("warning: perform_defragmentation: failed to read signature at %" PRIu64 "\n", src);
continue;
}
const int64_t meta_offset = static_cast<int64_t>(src) + 2;
if (fseek64(archive_->file, meta_offset, SEEK_SET) != 0 ||
lendian_fread(&compressed_size, sizeof(compressed_size), 1, archive_->file) != 1 ||
Expand All @@ -1042,7 +1049,12 @@ void block_allocator::perform_defragmentation() {
continue;
}
}
const uint64_t block_size = STORAGE_BLOCK_METASIZE + compressed_size;
const uint64_t meta_size = storage_block::meta_size_for(sig);
if (meta_size == 0) {
WARNING_PRINT("warning: perform_defragmentation: bad signature %u at %" PRIu64 "\n", sig, src);
continue;
}
const uint64_t block_size = meta_size + compressed_size;

// Ensure write_pos doesn't overlap with ANY reserved region (Files Table or B-tree nodes)
while (true) {
Expand Down
66 changes: 57 additions & 9 deletions src/compio.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -589,7 +589,8 @@ int compio_remove_file(compio_archive *archive, const char *name) {
// Deallocate. Maintenance is suspended globally, so this won't trigger defrag.
// Use overloaded deallocate with explicit false for perform_maintenance, although
// suspended state also prevents it.
archive->allocator->deallocate(val.addr, STORAGE_BLOCK_METASIZE + sb.size, false);
const uint64_t meta = sb.has_backref ? STORAGE_BLOCK_METASIZE_V2 : STORAGE_BLOCK_METASIZE;
archive->allocator->deallocate(val.addr, meta + sb.size, false);
} else {
WARNING_PRINT("warning: failed to read block at %" PRIu64 " during removal\n", val.addr);
if (saved_pos >= 0) {
Expand Down Expand Up @@ -2422,9 +2423,18 @@ int compio_repair(const char *path, const char *output_dir) {
uint64_t pos;
uint64_t addr;
uint64_t size;
bool from_backref;
};
std::map<uint64_t, std::vector<file_part>> index_files;

// Addresses referenced by surviving index nodes, and back-refs harvested
// directly from v2 self-describing blocks. After the scan, any v2 block
// whose address no surviving node references is re-attributed via its
// own {hash,pos}, recovering data whose index node was lost.
std::set<uint64_t> indexed_addrs;
/**
 * @brief Back-reference harvested from a v2 self-describing storage block.
 *
 * Member order matters: instances are built by positional aggregate
 * initialization as {hash, pos, addr, size}.
 */
struct block_backref {
    uint64_t hash; /**< Key hash stored in the block (src_key.hash) */
    uint64_t pos;  /**< Key position stored in the block (src_key.pos) */
    uint64_t addr; /**< Archive address at which the block was found */
    uint64_t size; /**< Block's original (uncompressed) size */
};
std::vector<block_backref> block_backrefs;

constexpr size_t BUFFER_SIZE = 1024 * 1024;
std::vector<uint8_t> buffer(BUFFER_SIZE);

Expand All @@ -2445,33 +2455,53 @@ int compio_repair(const char *path, const char *output_dir) {
for (size_t k = 0; k < node.keys.size(); ++k) {
if (k < node.values.size()) {
index_files[node.keys[k].hash].push_back({
node.keys[k].pos,
node.values[k].addr,
node.values[k].size
node.keys[k].pos,
node.values[k].addr,
node.values[k].size,
false
});
indexed_addrs.insert(node.values[k].addr);
}
}
}
fseek64(f, saved_pos, SEEK_SET);
}

if (sig == storage_block::signature || sig == storage_block::signature_crc32c) {

if (sig == storage_block::signature || sig == storage_block::signature_crc32c ||
sig == storage_block::signature_backref || sig == storage_block::signature_backref_crc32c) {
uint64_t saved_pos = ftell64(f);
storage_block sb;
if (sb.read_from(f, current_addr)) {
discovered_blocks[current_addr] = {
current_addr,
sb.size,
sb.original_size,
current_addr,
sb.size,
sb.original_size,
(bool)sb.is_compressed
};
if (sb.has_backref) {
block_backrefs.push_back({sb.src_key.hash, sb.src_key.pos,
current_addr, sb.original_size});
}
}
fseek64(f, saved_pos, SEEK_SET);
}
}
offset += bytes_read;
}

// Re-attribute v2 blocks whose owning index node was lost: if no surviving
// node references the block's address, trust its self-describing back-ref.
int reattributed = 0;
for (const auto &br : block_backrefs) {
if (indexed_addrs.find(br.addr) == indexed_addrs.end()) {
index_files[br.hash].push_back({br.pos, br.addr, br.size, true});
++reattributed;
}
}
if (reattributed > 0) {
WARNING_PRINT("info: re-attributed %d orphan block(s) via self-describing back-refs\n", reattributed);
}

compio_compressor compressor;
compio_build_compressor_by_type(&compressor, (compio_compression_type)h.compression_type);

Expand Down Expand Up @@ -2507,6 +2537,24 @@ int compio_repair(const char *path, const char *output_dir) {
return a.pos < b.pos;
});

// Files rebuilt entirely from self-describing back-refs may carry stale
// absolute positions (lazy add_to_range shifts don't rewrite evicted
// blocks). The logical stream is densely tiled, so the correct offset of
// each block is the running sum of preceding block sizes; sort order is
// preserved by every shift, so re-deriving positions this way recovers
// the file regardless of drift. Only applied when no surviving index
// node anchors any part (a partial index would still hold true offsets).
const bool all_from_backref = !parts.empty() &&
std::all_of(parts.begin(), parts.end(),
[](const auto& p) { return p.from_backref; });
if (all_from_backref) {
uint64_t running = 0;
for (auto& part : parts) {
part.pos = running;
running += part.size;
}
}

std::string filename;
if (hash_to_name.count(hash)) {
filename = hash_to_name[hash];
Expand Down
Loading
Loading