add tests and documentation

ekexium · Sep 23, 2019 · ba2fe2f · ba2fe2f
1 parent de34d70
commit ba2fe2f
Show file tree

Hide file tree

Showing 11 changed files with 470 additions and 110 deletions.
diff --git a/README.md b/README.md
@@ -1,2 +1,42 @@
 # KVIndex
-A simple B+Tree based clustered Index implementation for Key-Value storage.
+
+[![MIT Licence](https://badges.frapsoft.com/os/mit/mit.svg?v=103)](https://opensource.org/licenses/mit-license.php)
+
+A simple hash index implementation for random-read-only key-value storage.
+
+## Usage
+
+1. Call `KVIndex.initialize(filename)` to create index and initialize.
+2. Concurrently call `KVIndex.get()` to query.
+
+## Benchmark
+
+Platform: 2.4GHz 2-core CPU, 16 GB RAM, 512 GB APPLE SSD
+
+![](doc/benchmark.png)
+
+The benchmark shows that the value of `N` (the number of key-value pairs) has little effect on query performance. However, because disk I/O is the bottleneck, multithreading hardly improves query performance.
+
+
+## Implementation
+
+The project implements a hash index for query-only key-value storage.
+
+All indexes are stored on disk. By default, there are 512 index files in total.
+
+An index file consists of several slots. Each slot is (by default) 13 bytes and contains key_size, address, value_size, and next_slot_id. Collisions are handled with linked lists, which are chained through next_slot_id.
+
+A query first calculates the hash code of the key. Second, the address of the corresponding record is retrieved from the index file. Finally, the value is read from the data file and returned. The second and third steps may be repeated several times if there are hash collisions. The amortized number of disk accesses is 2.
+
+## Future work
+
+There are a few major factors that can be optimized to improve performance.
+
+1. [ ] better hash function
+2. [ ] memory buffer
+3. [ ] parallel initialization
+4. [ ] I/O optimization
+
+## Contact
+
+Please e-mail me at [email protected] with any suggestions.
diff --git a/build.gradle b/build.gradle
@@ -14,6 +14,7 @@ repositories {
 dependencies {
 //    testCompile group: 'junit', name: 'junit', version: '4.12'
     testCompile 'org.junit.jupiter:junit-jupiter:5.5.2'
+    testCompile 'org.apache.commons:commons-lang3:3.0'
 }
 
 test {

diff --git a/doc/benchmark.png b/doc/benchmark.png
diff --git a/src/main/java/HashFunc.java b/src/main/java/HashFunc.java
@@ -1,12 +1,11 @@
 import java.nio.ByteBuffer;
 
+/**
+ * A DJB hash function that maps key(bytes[], <= 4096 bytes) to an address(long).
+ *
+ * The capacity is set to the nearest upper 2^k to (N / preferred_load_factor).
+ */
 class HashFunc {
-    /**
-     * A DJB hash function that maps key(bytes[], <= 4096 bytes) to an address(long).
-     *
-     * The capacity is set to the nearest upper 2^k to (N / preferred_load_factor).
-     */
-
     long N;                 // size of the set of keys
     long capacity;          // capacity of slots
     int loadFactorInv = 2;  // the reciprocal of preferred load factor
@@ -34,6 +33,14 @@ class HashFunc {
         Log.logi("Hash capacity = " + capacity);
     }
 
+    /**
+     * DJB hash function.
+     * Thread-safe.
+     *
+     * @param key
+     *        Key of hash function
+     * @return The hashcode of key
+     */
     long hash(byte[] key) {
         long hash = 5381;
 

diff --git a/src/main/java/KVIndex.java b/src/main/java/KVIndex.java
@@ -5,30 +5,51 @@
 import java.nio.ByteBuffer;
 import java.util.Arrays;
 
+/**
+ * The main class of KVIndex using hash indexing.
+ *
+ * Input:
+ *  A binary data file consisting of records.
+ *  Each record is of format (key_size, key, value_size, value).
+ *
+ * Output:
+ *  get(key) returns the corresponding value.
+ *
+ * Implementation:
+ *  During initialization, create index files for all records.
+ *  Each file consists of several slots.
+ *
+ *  Default slot structure:
+ *  | key_size | address | value_size | next_slot_id |
+ *  |    2     |    5    |      2     |       4      |
+ *
+ *  address indicates the address of the original record in the data file.
+ *
+ *  key_size can be used to reduce unnecessary checks.
+ *
+ *  key_size and value_size can be used to reduce unnecessary disk accesses.
+ *
+ *  Use linked list to handle collisions.
+ *  next_slot_id indicates the id of the next slot in the linked list,
+ *  whose address = slot_size(13) * next_slot_id.
+ *
+ * Indexing:
+ *  hash() : key -> hashCode
+ *  hashCode: h bits
+ *  First (h - f) bits are used for in-file index.
+ *  Last f bits are used for file index, i.e. there are 2^f index files.
+ *
+ *  Example:
+ *  h = 24, f = 8
+ *  fileIdMask:      0x000000ff
+ *  infileIndexMask: 0x00ffff00
+ */
 public class KVIndex {
-    /**
-     * hash() : key -> hashCode
-     * hashCode: h bits
-     * First (h - f) bits are used for in-file index.
-     * Last f bits are used for file index, i.e. there are 2^f index files.
-     * <p>
-     * Example:
-     * h = 24, f = 8
-     * fileIdMask:      0x000000ff
-     * infileIndexMask: 0x00ffff00
-     */
-    final int f = 8;
-    long fileIdMask;
-    long infileIndexMask;
-
-    /**
-     * size of each slot in index files
-     * next_index_id use 4 bytes, since h - f = 40 - 8 = 32 bits
-     * i.e. there should be at most 2^31 -1 indexes in the file
-     * key_size | key_pos | next_index_id
-     * 2        |    5    |      4
-     */
+    final int f = 8;        // # of bits used for file id
+    long fileIdMask;        // bitwise mask for file id
+    long infileIndexMask;   // bitwise mask for in-file index
 
+    // constants used to specify the format of index slots
     private static final int addrLength = 5;
     private static final int infilePointerLength = 4;
     static int slotSize = Record.keySizeLength + addrLength
@@ -43,20 +64,27 @@ public class KVIndex {
     // original data file
     RandomAccessFile dataFile;
 
-    long N; // number of key-value pairs
-
-    long collisionCount = 0;
+    // number of key-value pairs
+    long N;
 
+    // hash function
     HashFunc hasher;
 
-    public static void main(String[] args) {
-        System.out.println("Hello PingCAP");
-    }
-
     KVIndex() {
         System.out.println("Hello PingCAP");
     }
 
+    /**
+     * Creates index to get ready for queries.
+     *
+     * @param filename
+     *        The filename of data.
+     *
+     * @throws IOException
+     *         If I/O errors occur.
+     * @throws InvalidDataFormatException
+     *         If the data file has invalid format.
+     */
     public void initialize(String filename)
             throws IOException, InvalidDataFormatException {
         N = countEntry(filename);
@@ -68,6 +96,17 @@ public void initialize(String filename)
         dataFile = new RandomAccessFile(filename, "r");
     }
 
+    /**
+     * Thread-safe query function that returns the value corresponding to the given key.
+     *
+     * @param key
+     *        Key of the query.
+     *
+     * @return The value.
+     *
+     * @throws UninitializedException
+     *         If the KVIndex object has not been initialized.
+     */
     synchronized public byte[] get(byte[] key) throws UninitializedException {
         if (hasher == null)
             throw new UninitializedException("KVIndex has not been initialized");
@@ -85,7 +124,8 @@ synchronized public byte[] get(byte[] key) throws UninitializedException {
 
                 while (true) {
                     if (slotSize * infileIndex < 0) {
-                        Log.loge("seek offset < 0");
+                        Log.logi("seek offset < 0");
+                        return null;
                     }
                     indexFile.seek(slotSize * infileIndex);
                     indexFile.read(slotArr);
@@ -112,7 +152,6 @@ synchronized public byte[] get(byte[] key) throws UninitializedException {
                             // find the key-value
                             // retrieve and return value
                             short valueSize = buf.getShort(Record.keySizeLength + addrLength);
-                            Log.logd("[get] value size = " + valueSize);
                             byte[] value = new byte[valueSize];
                             dataFile.seek(address + Record.keySizeLength
                                           + keySize + Record.valueSizeLength);
@@ -126,8 +165,9 @@ synchronized public byte[] get(byte[] key) throws UninitializedException {
                     infileIndex = buf.getInt(Record.keySizeLength
                                              + addrLength + Record.valueSizeLength);
 
-                    if (infileIndex < 0)
-                        Log.loge("infileIndex < 0");
+                    if (infileIndex <= 0) {
+                        return null;
+                    }
                 }
             }
         } catch (IOException e) {
@@ -136,6 +176,19 @@ synchronized public byte[] get(byte[] key) throws UninitializedException {
         }
     }
 
+    /**
+     * Counts the total number of records.
+     *
+     * @param filename
+     *        The filename of data.
+     *
+     * @return The number of records in the data
+     *
+     * @throws IOException
+     *         If I/O errors occur
+     * @throws InvalidDataFormatException
+     *         If the data file has invalid format.
+     */
     long countEntry(String filename) throws IOException, InvalidDataFormatException {
         long rt = 0;
         RecordReader reader = new RecordReader(filename);
@@ -147,7 +200,7 @@ long countEntry(String filename) throws IOException, InvalidDataFormatException
     }
 
     /**
-     * Create empty index files.
+     * Creates empty index files.
      */
     void createIndexFile() {
         try {
@@ -174,9 +227,19 @@ void createIndexFile() {
         }
     }
 
+    /**
+     * Creates index for every record.
+     *
+     * @param filename The filename of data
+     */
     private void createIndex(String filename) {
         try {
+            Log.logi("Begin creating index.");
+            long startTime = System.currentTimeMillis();
             RecordReader reader = new RecordReader(filename);
+
+            // open data file to check replicated key
+
             while (reader.hasNextRecord()) {
                 // get a record
                 Record record = reader.getNextRecord(true);
@@ -201,15 +264,14 @@ private void createIndex(String filename) {
                     // hash collision, need to add new slot
                     // temporarily store the address of next slot
                     // skip the key_position and value_size field
-                    collisionCount++;
-
                     indexFile.skipBytes(addrLength + Record.valueSizeLength);
                     byte[] nextPos = new byte[infilePointerLength];
                     indexFile.read(nextPos);
 
                     // set the pointer to the next slot to the end, where new record is written
                     indexFile.seek(slotSize * infileIndex
-                                   + Record.keySizeLength + addrLength + Record.valueSizeLength);
+                                   + Record.keySizeLength + addrLength +
+                                   Record.valueSizeLength);
                     indexFile.writeInt((int) (indexFile.length() / slotSize));
 
                     // append the file
@@ -218,25 +280,50 @@ private void createIndex(String filename) {
                 }
                 indexFile.close();
             }
+            Log.logi("Index created, used " + (System.currentTimeMillis() - startTime) + "ms.");
         } catch (IOException | InvalidDataFormatException e) {
             Log.loge("Failed to create index: " + e.getMessage());
             e.printStackTrace();
         }
     }
 
+    /**
+     * Calculates the masks for file id and infile index.
+     */
     void calculateMask() {
         fileIdMask = (1 << f) - 1;
         infileIndexMask = (hasher.capacity - 1) ^ fileIdMask;
     }
 
+    /**
+     * Returns the index file name
+     *
+     * @param fileId
+     *        The id of the index file
+     *
+     * @return The filename
+     */
     private String getIndexFilePath(int fileId) {
         return indexPath + File.separator + indexFilenamePrefix + fileId + indexFilenamePostfix;
     }
 
-    private void writeSlot(RandomAccessFile indexFile, Record record, byte[] nextPos)
+    /**
+     * Writes a record to the current position of indexFile.
+     *
+     * @param indexFile
+     *        File with position
+     * @param record
+     *        The record to be written
+     * @param nextSlotId
+     *        The next slot id in the linked list
+     *
+     * @throws IOException
+     *         If I/O errors occur.
+     */
+    private void writeSlot(RandomAccessFile indexFile, Record record, byte[] nextSlotId)
             throws IOException {
-        if (nextPos.length != infilePointerLength)
-            throw new IllegalArgumentException("Length of nextPos must be 4");
+        if (nextSlotId.length != infilePointerLength)
+            throw new IllegalArgumentException("Length of nextSlotId must be 4");
 
         Log.logd("--------writeslot--------");
         Log.logd("key = " + Arrays.toString(record.key));
@@ -259,11 +346,14 @@ private void writeSlot(RandomAccessFile indexFile, Record record, byte[] nextPos
         indexFile.writeShort(record.valueSize);
 
         // position of next slot if there is hash collision
-        indexFile.write(nextPos);
+        indexFile.write(nextSlotId);
         Log.logd("-------\\writeslot--------");
     }
 }
 
+/**
+ * The exception class for queries before initialization.
+ */
 class UninitializedException extends Exception {
     UninitializedException() {
         super();

diff --git a/src/main/java/Log.java b/src/main/java/Log.java
@@ -1,3 +1,6 @@
+/**
+ * A utility class for logging
+ */
 public class Log {
     private static final boolean useDebug = false;
     private static final boolean useWarning = true;

diff --git a/src/main/java/Record.java b/src/main/java/Record.java
@@ -1,5 +1,10 @@
+/**
+ * The record class that stores info of a record and some metadata about records.
+ *
+ * By default, assume key size and value size <= 4KB
+ */
 public class Record {
-    // key size and value size <= 4KB
+
 
     static int keySizeLength = 2;         // length of the field key_size, by default 2 bytes
     static int valueSizeLength = 2;       // length of the field value_size, by default 2 bytes