diff --git a/.gitignore b/.gitignore
index 3094469..b884c82 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,95 @@
+# Rust specific
+/target/
+**/target/
+**/*.rs.bk
+Cargo.lock
+*.pdb
+
+# Protocol Buffers
+*.pb.h
+*.pb.cc
+*.pb.go
+*.pb.swift
+*.pb.dart
+*.pb.js
+*.pb.ts
+*.pb.rs
+
+# Generated Rust files
+/src/autocomplete_proto.rs
+/src/autocomplete_proto/*.rs
+
+# C++ specific
+*.o
+*.obj
+*.exe
+*.out
+*.app
+*.dll
+*.so
+*.dylib
+*.a
+*.lib
+*.d
+*.lo
+*.la
+*.lai
+*.Plo
+*.Pla
+*.l
+*.elf
+*.bin
+*.hex
+*.map
+*.lst
+*.sym
+*.lss
+*.eep
+
+# Build directories
+/build/
+**/build/
+/debug_build/
+**/debug_build/
+/CMakeFiles/
+**/CMakeFiles/
+/CMakeCache.txt
+**/CMakeCache.txt
+/CMakeScripts/
+**/CMakeScripts/
+/Testing/
+**/Testing/
+/Makefile
+**/Makefile
+/cmake_install.cmake
+**/cmake_install.cmake
+/install_manifest.txt
+**/install_manifest.txt
+/compile_commands.json
+**/compile_commands.json
+/CTestTestfile.cmake
+**/CTestTestfile.cmake
+/_deps
+**/_deps
+/.cmake
+**/.cmake
+
+# IDE specific
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+# OS specific
.DS_Store
-build
+Thumbs.db
+
+# Project specific
+*.mapped
+*.mapped.stats
+*.dict
+*.inverted
+*.forward
+*.bin
+target/
diff --git a/.gitmodules b/.gitmodules
index 72f21cd..5b9dc7e 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -7,3 +7,9 @@
[submodule "external/mongoose"]
path = external/mongoose
url = https://github.com/cesanta/mongoose.git
+[submodule "external/doctest"]
+ path = external/doctest
+ url = https://github.com/onqtam/doctest.git
+[submodule "external/cmd_line_parser"]
+ path = external/cmd_line_parser
+ url = https://github.com/jermp/cmd_line_parser.git
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..35abc20
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright 2019 Giulio Ermanno Pibiri
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
\ No newline at end of file
diff --git a/README.md b/README.md
index b1a0946..624670f 100644
--- a/README.md
+++ b/README.md
@@ -1,177 +1,262 @@
-Autocomplete
-------------
+# Autocomplete System
-Query autocompletion in C++.
+This repository contains an autocomplete system implementation. The original C++ implementation is being ported to Rust and will be containerized for easier deployment and testing.
-##### Table of contents
-1. [Description](#descr)
-2. [Compiling the code](#compiling)
-3. [Input data format](#input)
-4. [Benchmarks](#benchmarks)
-5. [Live demo](#demo)
+## Project Structure
-Description
------------
+- `autocomplete-rs/`: The Rust port of the original C++ implementation
+- `archive/`: Original C++ implementation and related files
+
+## Goals
+
+1. Port the C++ implementation to Rust while maintaining the same functionality
+2. Leverage Rust's safety guarantees and modern tooling
+3. Containerize the application using Docker for easy deployment and testing
+
+## Current Status
+
+The porting process is ongoing. The following components have been ported to Rust:
+
+- Basic constants and configuration
+- Parameters management
+- Performance measurement probes
+
+## Building and Testing
-We designed two solutions (`autocomplete.hpp` and `autocomplete2.hpp`).
-The second solution avoids storing the forward index of the first solution.
+### Original C++ Implementation
+```bash
+cd archive
+make
+```
-Both solution build on two steps: (1) a prefix search (`prefix_topk`) and (2) a conjunctive search (`conjunctive_topk`).
+### Rust Implementation
+```bash
+cd autocomplete-rs
+cargo build
+cargo test
+```
-Recall that each completion has an associated integer identifier (henceforth, called docID), assigned in *decreasing* score order.
+## License
-#### 1. Prefix search
+This project is licensed under the MIT License - see the LICENSE file for details.
-This step returns the top-k completions that are prefixed by the terms in the query.
-For this purposes, we build a dictionary storing all completions seen as (multi-) sets of termIDs.
-Solution 1 uses an integer trie data structure (`completion_trie.hpp`);
-Solution 2 uses Front Coding (`integer_fc_dictionary.hpp`).
-We also materialize the list L of docIDs sorted by the lexicographical order of the completions (`unsorted_list.hpp`).
+Autocomplete
+------------
+
+A Query Auto-Completion system based on the paper *[Efficient and Effective Query Auto-Completion](https://dl.acm.org/doi/10.1145/3397271.3401432)*, by Simon Gog, Giulio Ermanno Pibiri, and Rossano Venturini,
+published in ACM SIGIR 2020.
+
+Please cite the paper if you use the data structures from this library.
+
+##### Table of contents
+1. [Installation and quick start](#install)
+2. [Compiling the code](#compiling)
+3. [Input data format](#input)
+4. [Running the unit tests](#testing)
+5. [Building an index](#building)
+6. [Benchmarks](#benchmarks)
+7. [Live demo](#demo)
+
+Installation and quick start
+------------------
-During a search, we first map the query terms to their lexicographic IDs by using a string dictionary (implemented as a 2-level index with Front Coding -- `fc_dictionary.hpp`). Then, we search the mapped query, say Q, into the completion trie to obtain the lexicographic range [l,r] of all completions that are children of Q. Then we need to identify the top-k docIDs from L[l,r]. Since the range [l,r] can be very large, we use a RMQ data structure built on L.
+Just run
-Having retrieved a list of (at most) k docIDs, we then:
+ bash ./install.sh
-1. Solution 1: use a forward index (`forward_index.hpp`) to materialize the identified completions into a string pool (`scored_string_pool.hpp`).
-The forward index stores the sorted (multi-) set of the termIDs of each completion, plus also the permutation of such termIDs in order to restore the original completion. The sets are stored in increasing-docID order.
-Specifically, we use the forward index to obtain the (permuted) set
-of termIDs and the string dictionary to extract the strings.
+from the parent directory. The script builds the code, prepares the test data in the folder `test_data/trec_05_efficiency_queries` for indexing, and executes the unit tests.
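+
+The same steps can also be performed by hand (a sketch; the individual
+commands are detailed in the sections below):
+
+    mkdir -p build && cd build
+    cmake .. -DCMAKE_BUILD_TYPE=Release && make -j
+    cd ../test_data
+    bash preprocess.sh trec_05_efficiency_queries/trec_05_efficiency_queries.completions 300
+    cd ../build && make test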
-2. Solution 2: use a map from docIDs to lexicographic IDs. For every top-k docID, we extract the corresponding completion from the FC-based dictionary.
+After that, for a minimal running example, just run
-#### 2. Conjunctive search
+ bash ./example.sh
-This step returns the top-k completions using an inverted index (`inverted_index.hpp`).
-For this purpose, let us consider a query Q[1,m] as tokenized into m terms (the last one possibly not completed).
-In this case we want to return the top-k (smallest) docIDs belonging
-to the intersection between the posting lists of the first m-1 terms
-and the union between all the postings lists of the terms that are
-prefixed by Q[m].
+and then access the service [from localhost](http://localhost:8000).
-To do so, we could trivially materialize the union and then proceed
-with the intersection.
-The clear problem with this approach is that the number of terms that are prefixed by Q[m] can be very large. Therefore iterating over the union can be overkilling.
+### Or you can use a prebuilt Docker image
-To solve this problem, we first obtain the lexicographic range of Q[m] by the string dictionary, say [l,r].
-We then iterate over the intersection of the first m-1 terms' posting lists and for each docID x we check whether the range [l,r] intersect the forward list of x. This check is done with the forward index.
-If the check succeeds, then x is among the top-k documents.
-We keep iterating over the intersection and checking the forward lists until we have k completions or we touch every docID in the intersection.
+The following command pulls a prebuilt Docker image and runs it locally.
-There is a special case for the case m = 1. In this case, we have no term before the last (only) one, thus we would check *all* forward lists for the range [l,r]. This is too expensive.
-Therefore, we use another RMQ data structure, built on the list, say M, of all the first (i.e., *minimal*) docIDs of the posting lists (think of it as the "first" column of the inverted index).
-A recursive heap-based algorithm is used to produce the smallest docIDs in M[l,r] using the RMQ data structure.
+ docker pull jermp/autocomplete
+ docker run -p 8000:8000 -d jermp/autocomplete
-The final string extraction step is identical to that of the
-prefix search.
+And then access the service [from localhost](http://localhost:8000).
Compiling the code
------------------
-The code is tested on Linux with `gcc` 7.4.0 and on Mac 10.14 with `clang` 10.0.0.
+The code has been tested on Linux with `gcc` 7.4.0, 8.3.0, and 9.0.0, and on macOS 10.14 and 12.4 with `clang` 10.0.0 and 13.0.0.
+
To build the code, [`CMake`](https://cmake.org/) is required.
Clone the repository with
- $ git clone --recursive https://github.com/jermp/autocomplete.git
+ git clone --recursive https://github.com/jermp/autocomplete.git
If you have cloned the repository without `--recursive`, you will need to perform the following commands before
compiling:
- $ git submodule init
- $ git submodule update
+ git submodule init
+ git submodule update
To compile the code for a release environment (see file `CMakeLists.txt` for the used compilation flags), it is sufficient to do the following:
- $ mkdir build
- $ cd build
- $ cmake ..
- $ make
+ mkdir build
+ cd build
+ cmake ..
+ make
-Hint: Use `make -j4` to compile the library in parallel using, e.g., 4 jobs.
+Hint: Use `make -j` to compile the library in parallel using all
+available threads.
For the best performance, we recommend compiling with:
- $ `cmake .. -DCMAKE_BUILD_TYPE=Release -DUSE_SANITIZERS=Off -DUSE_INTRINSICS=On -DUSE_PDEP=On`
+ cmake .. -DCMAKE_BUILD_TYPE=Release -DUSE_SANITIZERS=Off -DUSE_INTRINSICS=On -DUSE_PDEP=On
For a testing environment, use the following instead:
- $ mkdir debug_build
- $ cd debug_build
- $ cmake .. -DCMAKE_BUILD_TYPE=Debug -DUSE_SANITIZERS=On
- $ make
-
+ mkdir debug_build
+ cd debug_build
+ cmake .. -DCMAKE_BUILD_TYPE=Debug -DUSE_SANITIZERS=On
+ make
+
Input data format
-----------------
The input file should list all completions in
*lexicographical* order.
-For example, see the the file `test_data/trec05_efficiency_queries/trec05_efficiency_queries.completions`.
+For example, see the file `test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions`.
The first column represents the
ID of the completion; the other columns contain the
tokens separated by white spaces.
-(The IDs for the file `trec05_efficiency_queries.completions` are
+(The IDs for the file `trec_05_efficiency_queries.completions` are
fake, i.e., they do not take into account any
particular assignment.)
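+
+For instance, the first lines of a completions file could look as follows
+(both the IDs and the tokens are purely illustrative):
+
+    1 gmail
+    2 google mail login
+    3 google maps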
-The scripts in the directory `test_data` help in
-preparing the datasets for indexing:
+The script `preprocess.sh` in the directory `test_data` helps
+in preparing the data for indexing.
+Thus, from within the directory `test_data`, it is sufficient
+to do:
-1. The command
-
- $ extract_dict.py trec05_efficiency_queries/trec05_efficiency_queries.completions
-
- extract the dictionary
-from a file listing all completions in textual form.
+ bash preprocess.sh
-2. The command
+For example, with `trec_05_efficiency_queries`, the command is:
- $ python map_dataset.py trec05_efficiency_queries/trec05_efficiency_queries.completions
-
- maps strings to integer ids.
+ bash preprocess.sh trec_05_efficiency_queries/trec_05_efficiency_queries.completions 300
-3. The command
+The second argument in the example, i.e., 300, represents the
+number of completions (per completion size) that are drawn at
+random and could be used to query the indexes.
- $ python build_stats.py trec05_efficiency_queries/trec05_efficiency_queries.completions.mapped
-
- calulcates the dataset statistics.
+If you run the script, you will get:
-4. The command
-
- $ python build_inverted_and_forward.py trec05_efficiency_queries/trec05_efficiency_queries.completions
-
- builds the inverted and forward files.
-
-If you run the scripts in the reported order, you will get:
-
-- `trec05_efficiency_queries.completions.dict`: lists all the distinct
+- `trec_05_efficiency_queries.completions.dict`: lists all the distinct
tokens in the completions sorted in lexicographical
order.
-- `trec05_efficiency_queries.completions.mapped`: lists all completions
+- `trec_05_efficiency_queries.completions.mapped`: lists all completions
whose tokens have been mapped to integer ids
as assigned by a lexicographically-sorted
string dictionary (that should be built from the
-tokens listed in `trec05_efficiency_queries.completions.dict`).
+tokens listed in `trec_05_efficiency_queries.completions.dict`).
Each completion terminates with the id `0`.
-- `trec05_efficiency_queries.completions.mapped.stats` contains some
+- `trec_05_efficiency_queries.completions.mapped.stats` contains some
statistics about the datasets, needed to build
the data structures more efficiently.
-- `trec05_efficiency_queries.completions.inverted` is the inverted file.
+- `trec_05_efficiency_queries.completions.inverted` is the inverted file.
-- `trec05_efficiency_queries.completions.forward` is the forward file. Note that each list is *not* sorted, thus the lists are the same as the ones contained in `trec05_efficiency_queries.completions.mapped` but sorted in docID order.
+- `trec_05_efficiency_queries.completions.forward` is the forward file. Note that each list is *not* sorted, thus the lists are the same as the ones contained in `trec_05_efficiency_queries.completions.mapped` but sorted in docID order.
+
+Running the unit tests
+-----------
+
+The unit tests are written using [doctest](https://github.com/onqtam/doctest).
+
+After compilation and preparation of the data for indexing (see Section [Input data format](#input)), it is advised
+to run the unit tests with:
+
+ make test
+
+Building an index
+-----------
+
+After compiling the code, run the program `./build` to build an index. You can specify the type of the index and the name of the file
+where the index will be written.
+
+For example, with
+
+ ./build ef_type1 ../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions -o trec05.ef_type1.bin
+
+we can build an index of type `ef_type1` from the test file `../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions`, that will be serialized to the file `trec05.ef_type1.bin`.
+
+Possible types are `ef_type1`, `ef_type2`, `ef_type3` and `ef_type4`.
+
+Note: the type `ef_type4` requires an extra parameter
+to be specified, `c`. Use for example: `-c 0.0001`.
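+
+For instance (the output filename is arbitrary):
+
+    ./build ef_type4 ../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions -c 0.0001 -o trec05.ef_type4.bin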
Benchmarks
----------
-Run `benchmark/benchmark_prefix_topk` and `benchmark/benchmark_conjunctive_topk`.
+To run the top-k benchmarks in the `/benchmark` directory,
+we first need some query logs.
+They should have been created already if you have run the
+script `preprocess.sh`; otherwise,
+you can use
+
+ python3 partition_queries_by_length.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions trec_05_efficiency_queries/trec_05_efficiency_queries.completions.queries 300
+
+to partition the input completions by number of query terms
+and retain 300 queries at random.
+Query files are placed in the output directory
+`trec_05_efficiency_queries/trec_05_efficiency_queries.completions.queries`.
+(By default, 7 shards will be created: the ones having [1,6] query terms and
+the one collecting all completions with *at least* 7 query terms).
+
+Then the command
+
+ ./benchmark_topk ef_type1 10 trec05.ef_type1.bin 3 300 0.25 < ../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions.queries/queries.length=3.shuffled
+
+will execute up to 300 top-10 queries with 3 terms each, retaining only
+25% of each query's last token.
+
+We automated the collection of results with the script `script/collect_results_by_varying_percentage.py`.
+From within the `/build` directory, run
+
+ python3 ../script/collect_results_by_varying_percentage.py ef_type1 topk trec05.ef_type1.bin ../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions 10 300
+
+To benchmark the dictionaries (Front-Coding and trie), just run the following script from within
+the `script` directory:
-See the directory `results` for the results on the AOL and MSN query log.
+ bash benchmark_dictionaries.sh
Live demo
----------
Start the web server with the program `./web_server <port> <index filename>` and access the demo at
-`localhost:<port>`.
\ No newline at end of file
+`localhost:<port>`.
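+
+For example, from within the `build` directory:
+
+    ./web_server 8000 trec_05.ef_type1.bin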
diff --git a/TODO.md b/TODO.md
deleted file mode 100644
index 082ced9..0000000
--- a/TODO.md
+++ /dev/null
@@ -1,2 +0,0 @@
-
-- Study the effect of compression.
diff --git a/archive/.github/workflows/continuous_integration.yml b/archive/.github/workflows/continuous_integration.yml
new file mode 100644
index 0000000..bf625be
--- /dev/null
+++ b/archive/.github/workflows/continuous_integration.yml
@@ -0,0 +1,61 @@
+name: Continuous Integration
+
+on:
  [ push, pull_request ]
+
+jobs:
+ build:
+ name: Continuous Integration
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ os: [ ubuntu-latest ]
+ steps:
+
+ - name: Checkout code
+ uses: actions/checkout@v2
+
+ - name: Checkout submodules
+ run: git submodule update --init --recursive
+
+ - name: Check cmake version
+ run: cmake --version
+
+ - name: Creating build directory
+ run: cmake -E make_directory ./build
+
+ - name: Precompilation
+ working-directory: ./build
+ run: cmake .. -DCMAKE_BUILD_TYPE=Release
+
+ - name: Compilation
+ working-directory: ./build
+ run: cmake --build . --config Release
+
+ - name: Setup python
+ uses: actions/setup-python@v3
+ with:
+ python-version: '3.x'
+ architecture: 'x64'
+
+ - name: Data preprocessing
+ working-directory: ./test_data
+ run: bash preprocess.sh trec_05_efficiency_queries/trec_05_efficiency_queries.completions 300
+
+ - name: Testing
+ working-directory: ./build
+ run: ctest
+
+ - name: Build binary dictionary
+ working-directory: build
+ run: chmod +x build && ./build ef_type1 ../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions -o trec_05.ef_type1.bin
+
+ - name: Building docker image
+ run: docker build -t ${{ secrets.DOCKERHUB_USERNAME }}/autocomplete:latest .
+
+ - name: Dockerhub Authentication
+ run: docker login --username ${{ secrets.DOCKERHUB_USERNAME }} --password ${{ secrets.DOCKERHUB_ACCESS_TOKEN }}
+
+ - name: Publishing image to Container Registry
+ if: github.ref == 'refs/heads/master'
+ run: docker push ${{ secrets.DOCKERHUB_USERNAME }}/autocomplete:latest
diff --git a/CMakeLists.txt b/archive/CMakeLists.txt
similarity index 79%
rename from CMakeLists.txt
rename to archive/CMakeLists.txt
index 4c90e49..9b3c162 100644
--- a/CMakeLists.txt
+++ b/archive/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8)
+cmake_minimum_required(VERSION 3.5)
project(AUTOCOMPLETE)
if(CMAKE_BUILD_TYPE MATCHES Debug)
@@ -21,7 +21,7 @@ endif ()
if(UNIX)
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb")
@@ -50,4 +50,11 @@ include_directories(${AUTOCOMPLETE_SOURCE_DIR}/include)
add_subdirectory(external)
add_subdirectory(src)
add_subdirectory(benchmark)
-add_subdirectory(test)
\ No newline at end of file
+
+enable_testing()
+file(GLOB TEST_SOURCES test/test_*.cpp)
+foreach(TEST_SRC ${TEST_SOURCES})
+ get_filename_component (TEST_SRC_NAME ${TEST_SRC} NAME_WE) # without extension
+ add_executable(${TEST_SRC_NAME} ${TEST_SRC})
+ add_test(${TEST_SRC_NAME} ${TEST_SRC_NAME})
+endforeach(TEST_SRC)
diff --git a/archive/Dockerfile b/archive/Dockerfile
new file mode 100644
index 0000000..f29c164
--- /dev/null
+++ b/archive/Dockerfile
@@ -0,0 +1,25 @@
+FROM ubuntu:latest
+
+EXPOSE 8000
+
+RUN groupadd appgroup && useradd appuser -G appgroup
+
+COPY . /src
+
+WORKDIR /app
+
+RUN apt update && apt install -y cmake g++ python3
+
+RUN cmake /src && cmake --build .
+
+RUN chmod +x web_server && chmod +x build
+
+RUN ./build ef_type1 /src/test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions -o trec_05.ef_type1.bin
+
+RUN apt purge -y cmake g++ python3
+
+RUN rm -rf /src
+
+USER appuser
+
+CMD ["./web_server", "8000", "trec_05.ef_type1.bin"]
diff --git a/benchmark/CMakeLists.txt b/archive/benchmark/CMakeLists.txt
similarity index 59%
rename from benchmark/CMakeLists.txt
rename to archive/benchmark/CMakeLists.txt
index cf8359f..8f2c632 100644
--- a/benchmark/CMakeLists.txt
+++ b/archive/benchmark/CMakeLists.txt
@@ -1,5 +1,7 @@
-add_executable(benchmark_topk benchmark_topk.cpp)
+# add_executable(benchmark_topk benchmark_topk.cpp)
add_executable(benchmark_prefix_topk benchmark_prefix_topk.cpp)
add_executable(benchmark_conjunctive_topk benchmark_conjunctive_topk.cpp)
add_executable(benchmark_fc_dictionary benchmark_fc_dictionary.cpp)
-add_executable(benchmark_integer_fc_dictionary benchmark_integer_fc_dictionary.cpp)
\ No newline at end of file
+add_executable(benchmark_integer_fc_dictionary benchmark_integer_fc_dictionary.cpp)
+add_executable(benchmark_locate_prefix benchmark_locate_prefix.cpp)
+add_executable(effectiveness effectiveness.cpp)
\ No newline at end of file
diff --git a/archive/benchmark/benchmark_common.hpp b/archive/benchmark/benchmark_common.hpp
new file mode 100644
index 0000000..1a96333
--- /dev/null
+++ b/archive/benchmark/benchmark_common.hpp
@@ -0,0 +1,136 @@
+#pragma once
+
+#include "../external/cmd_line_parser/include/parser.hpp"
+#include "probe.hpp"
+
+namespace autocomplete {
+
+namespace benchmarking {
+static const uint32_t runs = 5;
+}
+
+// void tolower(std::string& str) {
+// std::transform(str.begin(), str.end(), str.begin(),
+// [](unsigned char c) { return std::tolower(c); });
+// }
+
+size_t load_queries(std::vector<std::string>& queries, uint32_t max_num_queries,
+                    float percentage, std::istream& is = std::cin) {
+ assert(percentage >= 0.0 and percentage <= 1.0);
+ std::string query;
+ queries.reserve(max_num_queries);
+ for (uint32_t i = 0; i != max_num_queries; ++i) {
+ if (!std::getline(is, query)) break;
+ assert(query.size() > 0);
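+        // Find where the last token starts, then truncate the query so that
+        // only `percentage` of that token is kept (always at least one char).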
+ size_t size = query.size() - 1;
+ while (size > 0 and query[size] != ' ') --size;
+ size_t last_token_size = query.size() - size;
+ size_t end = size + std::ceil(last_token_size * percentage) + 1 +
+ 1; // retain at least one char
+ for (size = query.size(); size > end; --size) query.pop_back();
+ // tolower(query);
+ queries.push_back(query);
+ }
+ return queries.size();
+}
+
+void configure_parser_for_benchmarking(cmd_line_parser::parser& parser) {
+ parser.add("type", "Index type.");
+ parser.add("k", "top-k value.");
+ parser.add("index_filename", "Index filename.");
+ parser.add("num_terms_per_query", "Number of terms per query.");
+ parser.add("max_num_queries", "Maximum number of queries to execute.");
+ parser.add("percentage",
+ "A float in [0,1] specifying how much we keep of the last token "
+ "in a query: n x 100 <=> n%, for n in [0,1].");
+}
+
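+// Expands into a complete benchmark driver program: it loads the given index,
+// runs each loaded query benchmarking::runs times, and reports per-phase
+// timings (parsing, search, reporting) as JSON lines.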
+#define BENCHMARK(what) \
+    template <typename Index> \
+ void benchmark(std::string const& index_filename, uint32_t k, \
+ uint32_t max_num_queries, float keep, \
+ essentials::json_lines& breakdowns) { \
+ Index index; \
+ essentials::load(index, index_filename.c_str()); \
+ \
+        std::vector<std::string> queries; \
+ uint32_t num_queries = \
+ load_queries(queries, max_num_queries, keep, std::cin); \
+ \
+ uint64_t reported_strings = 0; \
+ auto musec_per_query = [&](double time) { \
+ return time / (benchmarking::runs * num_queries); \
+ }; \
+ \
+ breakdowns.add("num_queries", std::to_string(num_queries)); \
+ \
+ timer_probe probe(3); \
+ for (uint32_t run = 0; run != benchmarking::runs; ++run) { \
+ for (auto const& query : queries) { \
+ auto it = index.what##topk(query, k, probe); \
+ reported_strings += it.size(); \
+ } \
+ } \
+ std::cout << "#ignore: " << reported_strings << std::endl; \
+ \
+ breakdowns.add("reported_strings", \
+ std::to_string(reported_strings / benchmarking::runs)); \
+ breakdowns.add( \
+ "parsing_musec_per_query", \
+ std::to_string(musec_per_query(probe.get(0).elapsed()))); \
+ breakdowns.add( \
+ std::string(#what) + "search_musec_per_query", \
+ std::to_string(musec_per_query(probe.get(1).elapsed()))); \
+ breakdowns.add( \
+ "reporting_musec_per_query", \
+ std::to_string(musec_per_query(probe.get(2).elapsed()))); \
+ breakdowns.add( \
+ "total_musec_per_query", \
+ std::to_string(musec_per_query(probe.get(0).elapsed()) + \
+ musec_per_query(probe.get(1).elapsed()) + \
+ musec_per_query(probe.get(2).elapsed()))); \
+ } \
+ \
+ int main(int argc, char** argv) { \
+ cmd_line_parser::parser parser(argc, argv); \
+ configure_parser_for_benchmarking(parser); \
+ if (!parser.parse()) return 1; \
+ \
+        auto type = parser.get<std::string>("type"); \
+        auto k = parser.get<uint32_t>("k"); \
+        auto index_filename = parser.get<std::string>("index_filename"); \
+        auto max_num_queries = parser.get<uint32_t>("max_num_queries"); \
+        auto keep = parser.get<float>("percentage"); \
+ \
+ essentials::json_lines breakdowns; \
+ breakdowns.new_line(); \
+ breakdowns.add("num_terms_per_query", \
+                       parser.get<std::string>("num_terms_per_query")); \
+ breakdowns.add("percentage", std::to_string(keep)); \
+ \
+ if (type == "ef_type1") { \
+            benchmark<ef_autocomplete_type1>( \
+ index_filename, k, max_num_queries, keep, breakdowns); \
+ } else if (type == "ef_type2") { \
+            benchmark<ef_autocomplete_type2>( \
+ index_filename, k, max_num_queries, keep, breakdowns); \
+ } else if (type == "ef_type3") { \
+            benchmark<ef_autocomplete_type3>( \
+ index_filename, k, max_num_queries, keep, breakdowns); \
+ } else if (type == "ef_type4") { \
+            benchmark<ef_autocomplete_type4>( \
+ index_filename, k, max_num_queries, keep, breakdowns); \
+ } else { \
+ return 1; \
+ } \
+ \
+ breakdowns.print(); \
+ return 0; \
+ }
+
+} // namespace autocomplete
\ No newline at end of file
diff --git a/archive/benchmark/benchmark_conjunctive_topk.cpp b/archive/benchmark/benchmark_conjunctive_topk.cpp
new file mode 100644
index 0000000..df14c84
--- /dev/null
+++ b/archive/benchmark/benchmark_conjunctive_topk.cpp
@@ -0,0 +1,7 @@
+#include <iostream>
+
+#include "types.hpp"
+#include "benchmark_common.hpp"
+
+using namespace autocomplete;
+BENCHMARK(conjunctive_)
\ No newline at end of file
diff --git a/benchmark/benchmark_fc_dictionary.cpp b/archive/benchmark/benchmark_fc_dictionary.cpp
similarity index 52%
rename from benchmark/benchmark_fc_dictionary.cpp
rename to archive/benchmark/benchmark_fc_dictionary.cpp
index f566edd..d3e66b5 100644
--- a/benchmark/benchmark_fc_dictionary.cpp
+++ b/archive/benchmark/benchmark_fc_dictionary.cpp
@@ -8,10 +8,10 @@ using namespace autocomplete;
template <typename Dictionary>
void perf_test(Dictionary const& dict,
std::vector<std::string> const& queries) {
-    std::vector<uint8_t> decoded(2 * constants::MAX_NUM_CHARS_PER_QUERY);
+    static std::vector<uint8_t> decoded(2 * constants::MAX_NUM_CHARS_PER_QUERY);
essentials::timer_type timer;
- for (uint32_t i = 0; i != runs; ++i) {
+ for (uint32_t i = 0; i != benchmarking::runs; ++i) {
timer.start();
for (auto const& query : queries) {
id_type id = dict.locate(string_to_byte_range(query));
@@ -20,8 +20,8 @@ void perf_test(Dictionary const& dict,
timer.stop();
}
- std::cout << "locate: " << (timer.average() * 1000.0) / queries.size()
- << " [ns/string]" << std::endl;
+ std::cout << "locate: " << timer.average() / queries.size()
+ << " [musec/string]" << std::endl;
std::vector<id_type> ids;
ids.reserve(queries.size());
@@ -32,7 +32,7 @@ void perf_test(Dictionary const& dict,
timer.reset();
- for (uint32_t i = 0; i != runs; ++i) {
+ for (uint32_t i = 0; i != benchmarking::runs; ++i) {
timer.start();
for (auto const& id : ids) {
uint8_t string_len = dict.extract(id, decoded.data());
@@ -41,8 +41,30 @@ void perf_test(Dictionary const& dict,
timer.stop();
}
- std::cout << "extract: " << (timer.average() * 1000.0) / ids.size()
- << " [ns/string]" << std::endl;
+ std::cout << "extract: " << timer.average() / ids.size()
+ << " [musec/string]" << std::endl;
+
+    static std::vector<float> percentages = {0.0, 0.25, 0.50, 0.75, 1.0};
+ for (auto p : percentages) {
+ timer.reset();
+ for (uint32_t i = 0; i != benchmarking::runs; ++i) {
+ timer.start();
+ for (auto const& query : queries) {
+ size_t size = query.size();
+ size_t n = size * p;
+ if (n == 0) n += 1; // at least one char
+ uint8_t const* addr =
+                    reinterpret_cast<uint8_t const*>(query.data());
+ range r = dict.locate_prefix({addr, addr + n});
+ essentials::do_not_optimize_away(r.end - r.begin);
+ }
+ timer.stop();
+ }
+
+ std::cout << "\tlocate_prefix-" << p * 100.0
+ << "%: " << timer.average() / queries.size()
+ << " [musec/string]" << std::endl;
+ }
}
#define exe(BUCKET_SIZE) \
@@ -57,30 +79,29 @@ void perf_test(Dictionary const& dict,
}
int main(int argc, char** argv) {
- int mandatory = 2 + 1;
- if (argc < mandatory) {
- std::cout << argv[0] << " < queries"
- << std::endl;
- return 1;
- }
+ cmd_line_parser::parser parser(argc, argv);
+ parser.add("collection_basename", "Collection basename.");
+ parser.add("max_num_queries", "Maximum number of queries to execute.");
+ if (!parser.parse()) return 1;
parameters params;
- params.collection_basename = argv[1];
+    params.collection_basename = parser.get<std::string>("collection_basename");
params.load();
- uint32_t num_queries = std::atoi(argv[2]);
+    auto max_num_queries = parser.get<uint32_t>("max_num_queries");
essentials::logger("loading queries...");
std::vector queries;
- queries.reserve(num_queries);
+ queries.reserve(max_num_queries);
std::string query;
query.reserve(2 * constants::MAX_NUM_CHARS_PER_QUERY);
- for (uint32_t i = 0; i != num_queries; ++i) {
+ for (uint32_t i = 0; i != max_num_queries; ++i) {
if (!std::getline(std::cin, query)) break;
queries.push_back(std::move(query));
}
- num_queries = queries.size();
- essentials::logger("loaded " + std::to_string(num_queries) + " queries");
+ max_num_queries = queries.size();
+ essentials::logger("loaded " + std::to_string(max_num_queries) +
+ " queries");
exe(4) exe(8) exe(16) exe(32) exe(64) exe(128) exe(256) return 0;
}
\ No newline at end of file
diff --git a/benchmark/benchmark_integer_fc_dictionary.cpp b/archive/benchmark/benchmark_integer_fc_dictionary.cpp
similarity index 94%
rename from benchmark/benchmark_integer_fc_dictionary.cpp
rename to archive/benchmark/benchmark_integer_fc_dictionary.cpp
index f1e35d9..8cb2b32 100644
--- a/benchmark/benchmark_integer_fc_dictionary.cpp
+++ b/archive/benchmark/benchmark_integer_fc_dictionary.cpp
@@ -8,10 +8,10 @@ using namespace autocomplete;
template <typename Dictionary>
void perf_test(Dictionary const& dict, std::vector<id_type> const& queries) {
- completion_type decoded(2 * constants::MAX_NUM_CHARS_PER_QUERY);
+ static completion_type decoded(2 * constants::MAX_NUM_CHARS_PER_QUERY);
essentials::timer_type timer;
- for (uint32_t i = 0; i != runs; ++i) {
+ for (uint32_t i = 0; i != benchmarking::runs; ++i) {
timer.start();
for (auto const& id : queries) {
uint8_t string_len = dict.extract(id, decoded);
diff --git a/archive/benchmark/benchmark_locate_prefix.cpp b/archive/benchmark/benchmark_locate_prefix.cpp
new file mode 100644
index 0000000..a9e374a
--- /dev/null
+++ b/archive/benchmark/benchmark_locate_prefix.cpp
@@ -0,0 +1,113 @@
+#include <iostream>
+
+#include "types.hpp"
+#include "statistics.hpp"
+#include "benchmark_common.hpp"
+
+using namespace autocomplete;
+
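+// A query is the parsed prefix (the termIDs of all tokens but the last)
+// paired with the lexicographic range of the terms prefixed by the last token.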
+typedef std::pair<completion_type, range> query_type;
+
+template <typename Index>
+void benchmark(parameters const& params, std::vector<query_type>& queries,
+ uint32_t num_queries, uint32_t num_terms_per_query, float keep) {
+ essentials::json_lines result;
+ result.new_line();
+ result.add("num_terms_per_query", std::to_string(num_terms_per_query));
+ result.add("percentage", std::to_string(keep));
+ result.add("num_queries", std::to_string(num_queries));
+
+ Index index;
+ {
+ typename Index::builder builder(params);
+ builder.build(index);
+ }
+
+    result.add("MiB", std::to_string(static_cast<double>(index.bytes()) /
+ essentials::MiB));
+ result.add(
+ "bytes_per_completion",
+        std::to_string(static_cast<double>(index.bytes()) / index.size()));
+
+ essentials::timer_type timer;
+ timer.start();
+ for (uint32_t run = 0; run != benchmarking::runs; ++run) {
+ for (auto& query : queries) {
+ auto r = index.locate_prefix(query.first, query.second);
+ essentials::do_not_optimize_away(r.end - r.begin);
+ }
+ }
+ timer.stop();
+ result.add(
+ "musec_per_query",
+ std::to_string(timer.elapsed() / (benchmarking::runs * num_queries)));
+ result.print();
+}
+
+int main(int argc, char** argv) {
+ cmd_line_parser::parser parser(argc, argv);
+ parser.add("type", "Index type.");
+ parser.add("collection_basename", "Collection basename.");
+ parser.add("num_terms_per_query", "Number of terms per query.");
+ parser.add("max_num_queries", "Maximum number of queries to execute.");
+ parser.add("percentage",
+ "A float in [0,1] specifying how much we keep of the last token "
+ "in a query.");
+ if (!parser.parse()) return 1;
+
+ parameters params;
+    params.collection_basename = parser.get<std::string>("collection_basename");
+ params.load();
+
+    auto type = parser.get<std::string>("type");
+    auto max_num_queries = parser.get<uint32_t>("max_num_queries");
+    auto num_terms_per_query = parser.get<uint32_t>("num_terms_per_query");
+    auto keep = parser.get<float>("percentage");
+
+ fc_dictionary_type dict;
+ {
+ fc_dictionary_type::builder builder(params);
+ builder.build(dict);
+ }
+
+    std::vector<std::string> strings;
+    std::vector<query_type> queries;
+ uint32_t num_queries = 0;
+
+ {
+ num_queries = load_queries(strings, max_num_queries, keep, std::cin);
+ for (auto const& string : strings) {
+ completion_type prefix;
+ byte_range suffix;
+ parse(dict, string, prefix, suffix, true);
+ range suffix_lex_range = dict.locate_prefix(suffix);
+ queries.emplace_back(prefix, suffix_lex_range);
+ }
+ }
+
+    if (type == "trie") {
+        benchmark<ef_completion_trie>(params, queries, num_queries,
+                                      num_terms_per_query, keep);
+    } else if (type == "fc") {
+        // benchmark<integer_fc_dictionary<4>>(params, queries, num_queries,
+        //                                     num_terms_per_query, keep);
+        // benchmark<integer_fc_dictionary<8>>(params, queries, num_queries,
+        //                                     num_terms_per_query, keep);
+        benchmark<integer_fc_dictionary<16>>(params, queries, num_queries,
+                                             num_terms_per_query, keep);
+        // benchmark<integer_fc_dictionary<32>>(params, queries, num_queries,
+        //                                      num_terms_per_query, keep);
+        // benchmark<integer_fc_dictionary<64>>(params, queries, num_queries,
+        //                                      num_terms_per_query, keep);
+        // benchmark<integer_fc_dictionary<128>>(params, queries, num_queries,
+        //                                       num_terms_per_query, keep);
+        // benchmark<integer_fc_dictionary<256>>(params, queries, num_queries,
+        //                                       num_terms_per_query, keep);
+ } else {
+ return 1;
+ }
+
+ return 0;
+}
\ No newline at end of file
diff --git a/archive/benchmark/benchmark_prefix_topk.cpp b/archive/benchmark/benchmark_prefix_topk.cpp
new file mode 100644
index 0000000..69a0bc1
--- /dev/null
+++ b/archive/benchmark/benchmark_prefix_topk.cpp
@@ -0,0 +1,7 @@
+#include <iostream>
+
+#include "types.hpp"
+#include "benchmark_common.hpp"
+
+using namespace autocomplete;
+BENCHMARK(prefix_)
\ No newline at end of file
diff --git a/archive/benchmark/benchmark_topk.cpp b/archive/benchmark/benchmark_topk.cpp
new file mode 100644
index 0000000..98d208c
--- /dev/null
+++ b/archive/benchmark/benchmark_topk.cpp
@@ -0,0 +1,7 @@
+#include <iostream>
+
+#include "types.hpp"
+#include "benchmark_common.hpp"
+
+using namespace autocomplete;
+BENCHMARK("")
\ No newline at end of file
diff --git a/archive/benchmark/effectiveness.cpp b/archive/benchmark/effectiveness.cpp
new file mode 100644
index 0000000..e9c6590
--- /dev/null
+++ b/archive/benchmark/effectiveness.cpp
@@ -0,0 +1,140 @@
+#include <iostream>
+
+#include "types.hpp"
+#include "benchmark_common.hpp"
+
+using namespace autocomplete;
+
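+// Measures effectiveness rather than speed: for each query, run prefix_topk
+// and conjunctive_topk on the same index and count how many better-scored
+// strings only the conjunctive search reports.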
+template <typename Index>
+void benchmark(std::string const& index_filename, uint32_t k,
+ uint32_t max_num_queries, float keep,
+ essentials::json_lines& stats, bool verbose) {
+ Index index1, index2;
+ essentials::load(index1, index_filename.c_str());
+ essentials::load(index2, index_filename.c_str());
+
+    std::vector<std::string> queries;
+ uint32_t num_queries =
+ load_queries(queries, max_num_queries, keep, std::cin);
+ uint64_t strings_reported_by_prefix_search = 0;
+ uint64_t better_scored_strings_reported_by_conjunctive_search = 0;
+
+ stats.add("num_queries", std::to_string(num_queries));
+
+    std::vector<id_type> difference;
+ difference.reserve(k);
+ nop_probe probe;
+
+ for (auto const& query : queries) {
+ auto it1 = index1.prefix_topk(query, k, probe);
+ auto it2 = index2.conjunctive_topk(query, k, probe);
+ strings_reported_by_prefix_search += it1.size();
+
+ uint64_t more = 0;
+ assert(it2.size() >= it1.size());
+
+ auto const& prefix_search_scores = it1.pool()->const_scores();
+ auto const& conjunctive_search_scores = it2.pool()->const_scores();
+ assert(std::is_sorted(prefix_search_scores.begin(),
+ prefix_search_scores.begin() + it1.size()));
+ assert(std::is_sorted(conjunctive_search_scores.begin(),
+ conjunctive_search_scores.begin() + it2.size()));
+
+ if (verbose) {
+ std::cout << "query: '" << query << "'" << std::endl;
+ {
+ auto it = it1;
+ std::cout << "prefix_search results: " << it.size()
+ << std::endl;
+ for (uint64_t i = 0; i != it.size(); ++i, ++it) {
+ auto completion = *it;
+ std::cout << completion.score << ": "
+ << std::string(completion.string.begin,
+ completion.string.end)
+ << std::endl;
+ }
+ }
+ {
+ auto it = it2;
+ std::cout << "conjunctive_search results: " << it.size()
+ << std::endl;
+ for (uint64_t i = 0; i != it.size(); ++i, ++it) {
+ auto completion = *it;
+ std::cout << completion.score << ": "
+ << std::string(completion.string.begin,
+ completion.string.end)
+ << std::endl;
+ }
+ }
+ }
+
+        difference.resize(k);  // not clear(): set_difference writes through begin()
+ auto it = std::set_difference(
+ conjunctive_search_scores.begin(),
+ conjunctive_search_scores.begin() + it2.size(),
+ prefix_search_scores.begin(),
+ prefix_search_scores.begin() + it1.size(), difference.begin());
+ more = std::distance(difference.begin(), it);
+ if (verbose) std::cout << "more: " << more << std::endl;
+ better_scored_strings_reported_by_conjunctive_search += more;
+ }
+
+ stats.add("strings_reported_by_prefix_search",
+ std::to_string(strings_reported_by_prefix_search));
+ stats.add(
+ "better_scored_strings_reported_by_conjunctive_search",
+ std::to_string(better_scored_strings_reported_by_conjunctive_search));
+ stats.add(
+ "better_scored_strings_reported_by_conjunctive_search_in_percentage",
+ std::to_string(better_scored_strings_reported_by_conjunctive_search *
+ 100.0 / strings_reported_by_prefix_search));
+}
+
+int main(int argc, char** argv) {
+ cmd_line_parser::parser parser(argc, argv);
+ parser.add("type", "Index type.");
+ parser.add("k", "top-k value.");
+ parser.add("index_filename", "Index filename.");
+ parser.add("num_terms_per_query", "Number of terms per query.");
+ parser.add("max_num_queries", "Maximum number of queries to execute.");
+ parser.add("percentage",
+ "A float in [0,1] specifying how much we keep of the last token "
+ "in a query: n x 100 <=> n%, for n in [0,1].");
+ parser.add("verbose", "Verbose output.", "--verbose");
+ if (!parser.parse()) return 1;
+
+    auto type = parser.get<std::string>("type");
+    auto k = parser.get<uint32_t>("k");
+    auto index_filename = parser.get<std::string>("index_filename");
+    auto max_num_queries = parser.get<uint32_t>("max_num_queries");
+    auto keep = parser.get<float>("percentage");
+    auto verbose = parser.get<bool>("verbose");
+
+ essentials::json_lines stats;
+ stats.new_line();
+ stats.add("num_terms_per_query",
+              parser.get<std::string>("num_terms_per_query"));
+ stats.add("percentage", std::to_string(keep));
+
+ if (type == "ef_type1") {
+        benchmark<ef_autocomplete_type1>(index_filename, k, max_num_queries,
+ keep, stats, verbose);
+ } else if (type == "ef_type2") {
+        benchmark<ef_autocomplete_type2>(index_filename, k, max_num_queries,
+ keep, stats, verbose);
+ } else if (type == "ef_type3") {
+        benchmark<ef_autocomplete_type3>(index_filename, k, max_num_queries,
+ keep, stats, verbose);
+ } else if (type == "ef_type4") {
+        benchmark<ef_autocomplete_type4>(index_filename, k, max_num_queries,
+ keep, stats, verbose);
+ } else {
+ return 1;
+ }
+
+ stats.print();
+ return 0;
+}
\ No newline at end of file
diff --git a/archive/example.sh b/archive/example.sh
new file mode 100644
index 0000000..4ac00bf
--- /dev/null
+++ b/archive/example.sh
@@ -0,0 +1,3 @@
+cd build
+./build ef_type1 ../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions -o trec_05.ef_type1.bin
+./web_server 8000 trec_05.ef_type1.bin
\ No newline at end of file
diff --git a/archive/external/CMakeLists.txt b/archive/external/CMakeLists.txt
new file mode 100644
index 0000000..5d0ee92
--- /dev/null
+++ b/archive/external/CMakeLists.txt
@@ -0,0 +1,4 @@
+include_directories(essentials/include)
+
+set(DOCTEST_INCLUDE_DIR ${AUTOCOMPLETE_SOURCE_DIR}/external/doctest)
+include_directories(${DOCTEST_INCLUDE_DIR})
\ No newline at end of file
diff --git a/archive/include/autocomplete.hpp b/archive/include/autocomplete.hpp
new file mode 100644
index 0000000..78e54ad
--- /dev/null
+++ b/archive/include/autocomplete.hpp
@@ -0,0 +1,230 @@
+#pragma once
+
+#include "util_types.hpp"
+#include "autocomplete_common.hpp"
+#include "scored_string_pool.hpp"
+#include "constants.hpp"
+
+namespace autocomplete {
+
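+// Solution 1 of the paper: answers top-k queries with a prefix search on the
+// completion set and a conjunctive search on the inverted index; a forward
+// index checks candidate docIDs and materializes the reported strings.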
+template <typename Completions, typename Dictionary, typename InvertedIndex,
+          typename ForwardIndex>
+struct autocomplete {
+ typedef scored_string_pool::iterator iterator_type;
+
+ autocomplete() {
+ m_pool.resize(constants::POOL_SIZE, constants::MAX_K);
+ }
+
+ autocomplete(parameters const& params)
+ : autocomplete() {
+ typename Completions::builder cm_builder(params);
+ typename Dictionary::builder di_builder(params);
+ typename InvertedIndex::builder ii_builder(params);
+ typename ForwardIndex::builder fi_builder(params);
+
+ m_unsorted_docs_list.build(cm_builder.doc_ids());
+ m_unsorted_minimal_docs_list.build(ii_builder.minimal_doc_ids());
+
+ cm_builder.build(m_completions);
+ di_builder.build(m_dictionary);
+ ii_builder.build(m_inverted_index);
+ fi_builder.build(m_forward_index);
+ }
+
+    template <typename Probe>
+ iterator_type prefix_topk(std::string const& query, const uint32_t k,
+ Probe& probe) {
+ assert(k <= constants::MAX_K);
+
+ probe.start(0);
+ init();
+ completion_type prefix;
+ byte_range suffix;
+ constexpr bool must_find_prefix = true;
+ if (!parse(m_dictionary, query, prefix, suffix, must_find_prefix)) {
+ return m_pool.begin();
+ }
+ probe.stop(0);
+
+ probe.start(1);
+ range suffix_lex_range = m_dictionary.locate_prefix(suffix);
+ if (suffix_lex_range.is_invalid()) return m_pool.begin();
+ suffix_lex_range.begin += 1;
+ suffix_lex_range.end += 1;
+ range r = m_completions.locate_prefix(prefix, suffix_lex_range);
+ if (r.is_invalid()) return m_pool.begin();
+ uint32_t num_completions =
+ m_unsorted_docs_list.topk(r, k, m_pool.scores());
+ probe.stop(1);
+
+ probe.start(2);
+ auto it = extract_strings(num_completions);
+ probe.stop(2);
+
+ return it;
+ }
+
+    template <typename Probe>
+ iterator_type conjunctive_topk(std::string const& query, const uint32_t k,
+ Probe& probe) {
+ assert(k <= constants::MAX_K);
+
+ probe.start(0);
+ init();
+ completion_type prefix;
+ byte_range suffix;
+ constexpr bool must_find_prefix = false;
+ parse(m_dictionary, query, prefix, suffix, must_find_prefix);
+ probe.stop(0);
+
+ probe.start(1);
+ range suffix_lex_range = m_dictionary.locate_prefix(suffix);
+ if (suffix_lex_range.is_invalid()) return m_pool.begin();
+ uint32_t num_completions = 0;
+ if (prefix.size() == 0) {
+ suffix_lex_range.end += 1;
+ num_completions = m_unsorted_minimal_docs_list.topk(
+ m_inverted_index, suffix_lex_range, k, m_pool.scores());
+ } else {
+ suffix_lex_range.begin += 1;
+ suffix_lex_range.end += 1;
+ num_completions = conjunctive_topk(prefix, suffix_lex_range, k);
+ }
+ probe.stop(1);
+
+ probe.start(2);
+ auto it = extract_strings(num_completions);
+ probe.stop(2);
+
+ return it;
+ }
+
+ // iterator_type topk(std::string const& query, const uint32_t k) {
+ // assert(k <= constants::MAX_K);
+ // init();
+ // completion_type prefix;
+ // byte_range suffix;
+ // uint32_t num_terms = parse(m_dictionary, query, prefix, suffix);
+ // assert(num_terms > 0);
+
+ // range suffix_lex_range = m_dictionary.locate_prefix(suffix);
+ // if (suffix_lex_range.is_invalid()) return m_pool.begin();
+
+ // suffix_lex_range.begin += 1;
+ // suffix_lex_range.end += 1;
+ // range r = m_completions.locate_prefix(prefix, suffix_lex_range);
+
+ // uint32_t num_completions = 0;
+ // if (r.is_valid()) {
+ // num_completions = m_unsorted_docs_list.topk(r, k,
+ // m_pool.scores());
+ // }
+
+ // if (num_completions < k) {
+ // if (num_terms == 1) { // special case
+ // suffix_lex_range.begin -= 1;
+ // num_completions = m_unsorted_minimal_docs_list.topk(
+ // suffix_lex_range, k, m_pool.scores(),
+ // true // must return unique results
+ // );
+ // } else {
+ // num_completions = conjunctive_topk(prefix, suffix_lex_range,
+ // k);
+ // }
+ // }
+
+ // return extract_strings(num_completions);
+ // }
+
+ size_t bytes() const {
+ return m_completions.bytes() + m_unsorted_docs_list.bytes() +
+ m_unsorted_minimal_docs_list.bytes() + m_dictionary.bytes() +
+ m_inverted_index.bytes() + m_forward_index.bytes();
+ }
+
+ void print_stats() const;
+
+    template <typename Visitor>
+ void visit(Visitor& visitor) {
+ visitor.visit(m_completions);
+ visitor.visit(m_unsorted_docs_list);
+ visitor.visit(m_unsorted_minimal_docs_list);
+ visitor.visit(m_dictionary);
+ visitor.visit(m_inverted_index);
+ visitor.visit(m_forward_index);
+ }
+
+private:
+ Completions m_completions;
+ unsorted_list_type m_unsorted_docs_list;
+    typedef minimal_docids<succinct_rmq, InvertedIndex> minimal_docids_type;
+ minimal_docids_type m_unsorted_minimal_docs_list;
+ Dictionary m_dictionary;
+ InvertedIndex m_inverted_index;
+ ForwardIndex m_forward_index;
+
+ scored_string_pool m_pool;
+
+ void init() {
+ m_pool.clear();
+ m_pool.init();
+ assert(m_pool.size() == 0);
+ }
+
+ uint32_t conjunctive_topk(completion_type& prefix, const range suffix,
+ uint32_t const k) {
+ deduplicate(prefix);
+ if (prefix.size() == 1) { // we've got nothing to intersect
+ auto it = m_inverted_index.iterator(prefix.front() - 1);
+ return conjunctive_topk(it, suffix, k);
+ }
+ auto it = m_inverted_index.intersection_iterator(prefix);
+ return conjunctive_topk(it, suffix, k);
+ }
+
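+    // Scan the docID iterator (a single posting list or the intersection of
+    // the first m-1 terms' lists), keeping each docID whose forward list
+    // intersects the last-token range r, and stop after k hits.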
+    template <typename Iterator>
+ uint32_t conjunctive_topk(Iterator& it, const range r, uint32_t const k) {
+ auto& topk_scores = m_pool.scores();
+ uint32_t results = 0;
+ for (; it.has_next(); ++it) {
+ auto doc_id = *it;
+ if (m_forward_index.intersects(doc_id, r)) {
+ topk_scores[results++] = doc_id;
+ if (results == k) break;
+ }
+ }
+ return results;
+ }
+
+ iterator_type extract_strings(const uint32_t num_completions) {
+ auto const& topk_scores = m_pool.scores();
+ for (uint32_t i = 0; i != num_completions; ++i) {
+ auto doc_id = topk_scores[i];
+ auto it = m_forward_index.iterator(doc_id);
+ uint64_t offset = m_pool.bytes();
+ uint8_t* decoded = m_pool.data() + offset;
+ for (uint32_t j = 0; j != it.size(); ++j, ++it) {
+ auto term_id = *it;
+ uint8_t len = m_dictionary.extract(term_id, decoded);
+ decoded += len;
+ offset += len;
+ if (j != it.size() - 1) {
+ *decoded++ = ' ';
+ offset++;
+ }
+ }
+ m_pool.push_back_offset(offset);
+ }
+ assert(m_pool.size() == num_completions);
+ return m_pool.begin();
+ }
+};
+} // namespace autocomplete
\ No newline at end of file
diff --git a/archive/include/autocomplete2.hpp b/archive/include/autocomplete2.hpp
new file mode 100644
index 0000000..eb3f994
--- /dev/null
+++ b/archive/include/autocomplete2.hpp
@@ -0,0 +1,262 @@
+#pragma once
+
+#include "util_types.hpp"
+#include "building_util.hpp"
+#include "compact_vector.hpp"
+#include "autocomplete_common.hpp"
+#include "scored_string_pool.hpp"
+#include "constants.hpp"
+
+namespace autocomplete {
+
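+// Solution 2 of the paper: avoids the forward index of solution 1 by storing
+// a docID-to-lexID map; every reported completion is extracted directly from
+// the FC-based dictionary of completions.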
+template <typename Completions, typename Dictionary, typename InvertedIndex>
+struct autocomplete2 {
+ typedef scored_string_pool::iterator iterator_type;
+
+ autocomplete2() {
+ m_pool.resize(constants::POOL_SIZE, constants::MAX_K);
+ m_topk_completion_set.resize(constants::MAX_K,
+ 2 * constants::MAX_NUM_TERMS_PER_QUERY);
+ }
+
+ autocomplete2(parameters const& params)
+ : autocomplete2() {
+ typename Completions::builder cm_builder(params);
+ typename Dictionary::builder di_builder(params);
+ typename InvertedIndex::builder ii_builder(params);
+ auto const& docid_to_lexid = cm_builder.docid_to_lexid();
+ m_docid_to_lexid.build(docid_to_lexid.begin(), docid_to_lexid.size(),
+ util::ceil_log2(params.num_completions + 1));
+ m_unsorted_docs_list.build(
+ util::invert(docid_to_lexid, params.num_completions));
+ m_unsorted_minimal_docs_list.build(ii_builder.minimal_doc_ids());
+ cm_builder.build(m_completions);
+ di_builder.build(m_dictionary);
+ ii_builder.build(m_inverted_index);
+ }
+
+    template <typename Probe>
+ iterator_type prefix_topk(std::string const& query, const uint32_t k,
+ Probe& probe) {
+ assert(k <= constants::MAX_K);
+
+ probe.start(0);
+ init();
+ completion_type prefix;
+ byte_range suffix;
+ constexpr bool must_find_prefix = true;
+ if (!parse(m_dictionary, query, prefix, suffix, must_find_prefix)) {
+ return m_pool.begin();
+ }
+ probe.stop(0);
+
+ probe.start(1);
+ range suffix_lex_range = m_dictionary.locate_prefix(suffix);
+ if (suffix_lex_range.is_invalid()) return m_pool.begin();
+ suffix_lex_range.begin += 1;
+ suffix_lex_range.end += 1;
+ range r = m_completions.locate_prefix(prefix, suffix_lex_range);
+ if (r.is_invalid()) return m_pool.begin();
+ uint32_t num_completions =
+ m_unsorted_docs_list.topk(r, k, m_pool.scores());
+ probe.stop(1);
+
+ probe.start(2);
+ extract_completions(num_completions);
+ auto it = extract_strings(num_completions);
+ probe.stop(2);
+
+ return it;
+ }
+
+    template <typename Probe>
+ iterator_type conjunctive_topk(std::string const& query, const uint32_t k,
+ Probe& probe) {
+ assert(k <= constants::MAX_K);
+
+ probe.start(0);
+ init();
+ completion_type prefix;
+ byte_range suffix;
+ constexpr bool must_find_prefix = false;
+ parse(m_dictionary, query, prefix, suffix, must_find_prefix);
+ probe.stop(0);
+
+ probe.start(1);
+ range suffix_lex_range = m_dictionary.locate_prefix(suffix);
+ if (suffix_lex_range.is_invalid()) return m_pool.begin();
+ uint32_t num_completions = 0;
+ if (prefix.size() == 0) {
+ suffix_lex_range.end += 1;
+ num_completions = m_unsorted_minimal_docs_list.topk(
+ m_inverted_index, suffix_lex_range, k, m_pool.scores());
+ extract_completions(num_completions);
+ } else {
+ suffix_lex_range.begin += 1;
+ suffix_lex_range.end += 1;
+ num_completions = conjunctive_topk(prefix, suffix_lex_range, k);
+ }
+ probe.stop(1);
+
+ probe.start(2);
+ auto it = extract_strings(num_completions);
+ probe.stop(2);
+
+ return it;
+ }
+
+ // iterator_type topk(std::string const& query, const uint32_t k) {
+ // assert(k <= constants::MAX_K);
+ // init();
+ // completion_type prefix;
+ // byte_range suffix;
+ // uint32_t num_terms = parse(m_dictionary, query, prefix, suffix);
+ // assert(num_terms > 0);
+
+ // range suffix_lex_range = m_dictionary.locate_prefix(suffix);
+ // if (suffix_lex_range.is_invalid()) return m_pool.begin();
+
+ // suffix_lex_range.begin += 1;
+ // suffix_lex_range.end += 1;
+ // range r = m_completions.locate_prefix(prefix, suffix_lex_range);
+
+ // uint32_t num_completions = 0;
+ // if (r.is_valid()) {
+ // num_completions = m_unsorted_docs_list.topk(r, k,
+ // m_pool.scores());
+ // }
+
+ // if (num_completions < k) {
+ // if (num_terms == 1) { // special case
+ // suffix_lex_range.begin -= 1;
+ // num_completions = m_unsorted_minimal_docs_list.topk(
+ // suffix_lex_range, k, m_pool.scores(),
+ // true // must return unique results
+ // );
+ // extract_completions(num_completions);
+ // } else {
+ // num_completions = conjunctive_topk(prefix, suffix_lex_range,
+ // k);
+ // }
+ // } else {
+ // extract_completions(num_completions);
+ // }
+
+ // return extract_strings(num_completions);
+ // }
+
+ size_t bytes() const {
+ return m_completions.bytes() + m_unsorted_docs_list.bytes() +
+ m_unsorted_minimal_docs_list.bytes() + m_dictionary.bytes() +
+ m_docid_to_lexid.bytes() + m_inverted_index.bytes();
+ }
+
+ void print_stats() const;
+
+    template <typename Visitor>
+ void visit(Visitor& visitor) {
+ visitor.visit(m_completions);
+ visitor.visit(m_unsorted_docs_list);
+ visitor.visit(m_unsorted_minimal_docs_list);
+ visitor.visit(m_dictionary);
+ visitor.visit(m_inverted_index);
+ visitor.visit(m_docid_to_lexid);
+ }
+
+private:
+ Completions m_completions;
+ unsorted_list_type m_unsorted_docs_list;
+    typedef minimal_docids<succinct_rmq, InvertedIndex> minimal_docids_type;
+ minimal_docids_type m_unsorted_minimal_docs_list;
+ Dictionary m_dictionary;
+ InvertedIndex m_inverted_index;
+ compact_vector m_docid_to_lexid;
+
+ scored_string_pool m_pool;
+ completion_set m_topk_completion_set;
+
+ void init() {
+ m_pool.clear();
+ m_pool.init();
+ assert(m_pool.size() == 0);
+ }
+
+ void extract_completions(const uint32_t num_completions) {
+ auto const& topk_scores = m_pool.scores();
+ auto& completions = m_topk_completion_set.completions();
+ auto& sizes = m_topk_completion_set.sizes();
+ for (uint32_t i = 0; i != num_completions; ++i) {
+ auto doc_id = topk_scores[i];
+ auto lex_id = m_docid_to_lexid[doc_id];
+ uint8_t size = m_completions.extract(lex_id, completions[i]);
+ sizes[i] = size;
+ }
+ }
+
+ uint32_t conjunctive_topk(completion_type& prefix, const range suffix,
+ uint32_t const k) {
+ deduplicate(prefix);
+ if (prefix.size() == 1) { // we've got nothing to intersect
+ auto it = m_inverted_index.iterator(prefix.front() - 1);
+ return conjunctive_topk(it, suffix, k);
+ }
+ auto it = m_inverted_index.intersection_iterator(prefix);
+ return conjunctive_topk(it, suffix, k);
+ }
+
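+    // No forward index here: each candidate docID is mapped to its lexID and
+    // its completion extracted; the docID is kept if one of the completion's
+    // termIDs falls in the last-token range r.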
+    template <typename Iterator>
+ uint32_t conjunctive_topk(Iterator& it, const range r, const uint32_t k) {
+ auto& topk_scores = m_pool.scores();
+ auto& completions = m_topk_completion_set.completions();
+ auto& sizes = m_topk_completion_set.sizes();
+ uint32_t i = 0;
+
+ for (; it.has_next(); ++it) {
+ auto doc_id = *it;
+ auto lex_id = m_docid_to_lexid[doc_id];
+ uint32_t size = m_completions.extract(lex_id, completions[i]);
+ for (uint32_t j = 0; j != size; ++j) {
+ if (r.contains(completions[i][j])) {
+ topk_scores[i] = doc_id;
+ sizes[i] = size;
+ ++i;
+ if (i == k) return k;
+ break;
+ }
+ }
+ }
+
+ return i;
+ }
+
+ iterator_type extract_strings(const uint32_t num_completions) {
+ auto const& completions = m_topk_completion_set.completions();
+ auto const& sizes = m_topk_completion_set.sizes();
+ for (uint32_t i = 0; i != num_completions; ++i) {
+ auto const& c = completions[i];
+ uint32_t size = sizes[i];
+ uint64_t offset = m_pool.bytes();
+ uint8_t* decoded = m_pool.data() + offset;
+ for (uint32_t j = 0; j != size; ++j) {
+ auto term_id = c[j];
+ uint8_t len = m_dictionary.extract(term_id, decoded);
+ decoded += len;
+ offset += len;
+ if (j != size - 1) {
+ *decoded++ = ' ';
+ offset++;
+ }
+ }
+ m_pool.push_back_offset(offset);
+ }
+ assert(m_pool.size() == num_completions);
+ return m_pool.begin();
+ }
+};
+} // namespace autocomplete
\ No newline at end of file
diff --git a/archive/include/autocomplete3.hpp b/archive/include/autocomplete3.hpp
new file mode 100644
index 0000000..6765ad6
--- /dev/null
+++ b/archive/include/autocomplete3.hpp
@@ -0,0 +1,265 @@
+#pragma once
+
+#include "util_types.hpp"
+#include "building_util.hpp"
+#include "compact_vector.hpp"
+#include "autocomplete_common.hpp"
+#include "scored_string_pool.hpp"
+#include "constants.hpp"
+
+namespace autocomplete {
+
+/*
+During the conjunctive step, maintain a min-heap of iterators,
+one iterator for each termID in the lexicographic range of the
+last token of the query.
+*/
+
+template <typename Completions, typename Dictionary, typename InvertedIndex>
+struct autocomplete3 {
+ typedef scored_string_pool::iterator iterator_type;
+    typedef min_heap<typename InvertedIndex::iterator_type,
+                     iterator_comparator<typename InvertedIndex::iterator_type>>
+        min_priority_queue_type;
+
+ autocomplete3() {
+ m_pool.resize(constants::POOL_SIZE, constants::MAX_K);
+ m_topk_completion_set.resize(constants::MAX_K,
+ 2 * constants::MAX_NUM_TERMS_PER_QUERY);
+ }
+
+ autocomplete3(parameters const& params)
+ : autocomplete3() {
+ typename Completions::builder cm_builder(params);
+ typename Dictionary::builder di_builder(params);
+ typename InvertedIndex::builder ii_builder(params);
+ auto const& docid_to_lexid = cm_builder.docid_to_lexid();
+ m_docid_to_lexid.build(docid_to_lexid.begin(), docid_to_lexid.size(),
+ util::ceil_log2(params.num_completions + 1));
+ m_unsorted_docs_list.build(
+ util::invert(docid_to_lexid, params.num_completions));
+ cm_builder.build(m_completions);
+ di_builder.build(m_dictionary);
+ ii_builder.build(m_inverted_index);
+ }
+
+ template <typename Probe>
+ iterator_type prefix_topk(std::string const& query, const uint32_t k,
+ Probe& probe) {
+ assert(k <= constants::MAX_K);
+
+ probe.start(0);
+ init();
+ completion_type prefix;
+ byte_range suffix;
+ constexpr bool must_find_prefix = true;
+ if (!parse(m_dictionary, query, prefix, suffix, must_find_prefix)) {
+ return m_pool.begin();
+ }
+ probe.stop(0);
+
+ probe.start(1);
+ range suffix_lex_range = m_dictionary.locate_prefix(suffix);
+ if (suffix_lex_range.is_invalid()) return m_pool.begin();
+ suffix_lex_range.begin += 1;
+ suffix_lex_range.end += 1;
+ range r = m_completions.locate_prefix(prefix, suffix_lex_range);
+ if (r.is_invalid()) return m_pool.begin();
+ uint32_t num_completions =
+ m_unsorted_docs_list.topk(r, k, m_pool.scores());
+ probe.stop(1);
+
+ probe.start(2);
+ extract_completions(num_completions);
+ auto it = extract_strings(num_completions);
+ probe.stop(2);
+
+ return it;
+ }
+
+ template <typename Probe>
+ iterator_type conjunctive_topk(std::string const& query, const uint32_t k,
+ Probe& probe) {
+ assert(k <= constants::MAX_K);
+
+ probe.start(0);
+ init();
+ completion_type prefix;
+ byte_range suffix;
+ constexpr bool must_find_prefix = false;
+ parse(m_dictionary, query, prefix, suffix, must_find_prefix);
+ probe.stop(0);
+
+ probe.start(1);
+ uint32_t num_completions = 0;
+ range suffix_lex_range = m_dictionary.locate_prefix(suffix);
+ if (suffix_lex_range.is_invalid()) return m_pool.begin();
+ suffix_lex_range.begin += 1;
+ suffix_lex_range.end += 1;
+ num_completions = conjunctive_topk(prefix, suffix_lex_range, k);
+ probe.stop(1);
+
+ probe.start(2);
+ extract_completions(num_completions);
+ auto it = extract_strings(num_completions);
+ probe.stop(2);
+
+ return it;
+ }
+
+ // iterator_type topk(std::string const& query, const uint32_t k) {
+ // assert(k <= constants::MAX_K);
+ // init();
+ // completion_type prefix;
+ // byte_range suffix;
+ // uint32_t num_terms = parse(m_dictionary, query, prefix, suffix);
+ // assert(num_terms > 0);
+
+ // range suffix_lex_range = m_dictionary.locate_prefix(suffix);
+ // if (suffix_lex_range.is_invalid()) return m_pool.begin();
+
+ // suffix_lex_range.begin += 1;
+ // suffix_lex_range.end += 1;
+ // range r = m_completions.locate_prefix(prefix, suffix_lex_range);
+
+ // uint32_t num_completions = 0;
+ // if (r.is_valid()) {
+ // num_completions = m_unsorted_docs_list.topk(r, k,
+ // m_pool.scores());
+ // }
+
+ // if (num_completions < k) {
+ // num_completions =
+ // conjunctive_topk(num_terms, prefix, suffix_lex_range, k);
+ // }
+
+ // extract_completions(num_completions);
+ // return extract_strings(num_completions);
+ // }
+
+ size_t bytes() const {
+ return m_completions.bytes() + m_unsorted_docs_list.bytes() +
+ m_dictionary.bytes() + m_docid_to_lexid.bytes() +
+ m_inverted_index.bytes();
+ }
+
+ void print_stats() const;
+
+ template <typename Visitor>
+ void visit(Visitor& visitor) {
+ visitor.visit(m_completions);
+ visitor.visit(m_unsorted_docs_list);
+ visitor.visit(m_dictionary);
+ visitor.visit(m_inverted_index);
+ visitor.visit(m_docid_to_lexid);
+ }
+
+private:
+ Completions m_completions;
+ unsorted_list_type m_unsorted_docs_list;
+ Dictionary m_dictionary;
+ InvertedIndex m_inverted_index;
+ compact_vector m_docid_to_lexid;
+
+ scored_string_pool m_pool;
+ completion_set m_topk_completion_set;
+
+ void init() {
+ m_pool.clear();
+ m_pool.init();
+ assert(m_pool.size() == 0);
+ }
+
+ void extract_completions(const uint32_t num_completions) {
+ auto const& topk_scores = m_pool.scores();
+ auto& completions = m_topk_completion_set.completions();
+ auto& sizes = m_topk_completion_set.sizes();
+ for (uint32_t i = 0; i != num_completions; ++i) {
+ auto doc_id = topk_scores[i];
+ auto lex_id = m_docid_to_lexid[doc_id];
+ uint8_t size = m_completions.extract(lex_id, completions[i]);
+ sizes[i] = size;
+ }
+ }
+
+ uint32_t conjunctive_topk(completion_type& prefix,
+ const range suffix_lex_range, const uint32_t k) {
+ if (prefix.size() == 0) { // we've got nothing to intersect
+ return heap_topk(m_inverted_index, suffix_lex_range, k,
+ m_pool.scores());
+ }
+ deduplicate(prefix);
+ if (prefix.size() == 1) { // we've got nothing to intersect
+ auto it = m_inverted_index.iterator(prefix.front() - 1);
+ return conjunctive_topk(it, suffix_lex_range, k);
+ }
+ auto it = m_inverted_index.intersection_iterator(prefix);
+ return conjunctive_topk(it, suffix_lex_range, k);
+ }
+
+ template <typename Iterator>
+ uint32_t conjunctive_topk(Iterator& it, const range r, const uint32_t k) {
+ assert(r.is_valid());
+
+ auto& topk_scores = m_pool.scores();
+ min_priority_queue_type q;
+ q.reserve(r.end - r.begin + 1); // inclusive range
+ assert(r.begin > 0);
+ for (uint64_t term_id = r.begin; term_id <= r.end; ++term_id) {
+ q.push_back(m_inverted_index.iterator(term_id - 1));
+ }
+ q.make_heap();
+
+ uint32_t results = 0;
+ for (; it.has_next() and !q.empty(); ++it) {
+ auto doc_id = *it;
+ while (!q.empty()) {
+ auto& z = q.top();
+ auto val = *z;
+ if (val > doc_id) break;
+ if (val < doc_id) {
+ val = z.next_geq(doc_id);
+ if (!z.has_next()) {
+ q.pop();
+ } else {
+ q.heapify();
+ }
+ }
+ if (val == doc_id) { // NOTE: putting else here seems to slow
+ // down the code!
+ topk_scores[results++] = doc_id;
+ if (results == k) return results;
+ break;
+ }
+ }
+ }
+
+ return results;
+ }
+
+ iterator_type extract_strings(const uint32_t num_completions) {
+ auto const& completions = m_topk_completion_set.completions();
+ auto const& sizes = m_topk_completion_set.sizes();
+ for (uint32_t i = 0; i != num_completions; ++i) {
+ auto const& c = completions[i];
+ uint32_t size = sizes[i];
+ uint64_t offset = m_pool.bytes();
+ uint8_t* decoded = m_pool.data() + offset;
+ for (uint32_t j = 0; j != size; ++j) {
+ auto term_id = c[j];
+ uint8_t len = m_dictionary.extract(term_id, decoded);
+ decoded += len;
+ offset += len;
+ if (j != size - 1) {
+ *decoded++ = ' ';
+ offset++;
+ }
+ }
+ m_pool.push_back_offset(offset);
+ }
+ assert(m_pool.size() == num_completions);
+ return m_pool.begin();
+ }
+};
+} // namespace autocomplete
\ No newline at end of file
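The header comment above describes the key idea: one posting-list iterator per term id in the last token's lexicographic range, merged with a min-heap keyed on each iterator's current doc id. A self-contained sketch of that union-by-heap pattern, with plain vectors instead of compressed lists (the real conjunctive_topk additionally intersects candidates with the prefix terms):

```cpp
// Min-heap union of sorted posting lists, reporting the k smallest doc ids.
#include <cstdint>
#include <functional>
#include <queue>
#include <tuple>
#include <vector>

using posting_list = std::vector<uint32_t>;          // sorted doc ids
using entry = std::tuple<uint32_t, size_t, size_t>;  // (doc_id, list, pos)

std::vector<uint32_t> topk_union(std::vector<posting_list> const& lists,
                                 size_t k) {
    std::priority_queue<entry, std::vector<entry>, std::greater<entry>> q;
    for (size_t i = 0; i != lists.size(); ++i)
        if (!lists[i].empty()) q.emplace(lists[i][0], i, 0);
    std::vector<uint32_t> out;
    while (!q.empty() && out.size() != k) {
        auto [doc, i, pos] = q.top();
        q.pop();
        // doc ids come out in nondecreasing order, so dedup is one compare
        if (out.empty() || out.back() != doc) out.push_back(doc);
        if (pos + 1 != lists[i].size()) q.emplace(lists[i][pos + 1], i, pos + 1);
    }
    return out;
}
```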
diff --git a/archive/include/autocomplete4.hpp b/archive/include/autocomplete4.hpp
new file mode 100644
index 0000000..7d84bae
--- /dev/null
+++ b/archive/include/autocomplete4.hpp
@@ -0,0 +1,290 @@
+#pragma once
+
+#include "util_types.hpp"
+#include "building_util.hpp"
+#include "compact_vector.hpp"
+#include "autocomplete_common.hpp"
+#include "scored_string_pool.hpp"
+#include "constants.hpp"
+
+namespace autocomplete {
+
+/* Bast and Weber approach. */
+
+template <typename Completions, typename Dictionary,
+          typename BlockedInvertedIndex>
+struct autocomplete4 {
+ typedef scored_string_pool::iterator iterator_type;
+
+ autocomplete4() {
+ m_pool.resize(constants::POOL_SIZE, constants::MAX_K);
+ m_topk_completion_set.resize(constants::MAX_K,
+ 2 * constants::MAX_NUM_TERMS_PER_QUERY);
+ }
+
+ autocomplete4(parameters const& params, float c)
+ : autocomplete4() {
+ typename Completions::builder cm_builder(params);
+ typename Dictionary::builder di_builder(params);
+ typename BlockedInvertedIndex::builder ii_builder(params, c);
+ auto const& docid_to_lexid = cm_builder.docid_to_lexid();
+ m_docid_to_lexid.build(docid_to_lexid.begin(), docid_to_lexid.size(),
+ util::ceil_log2(params.num_completions + 1));
+ m_unsorted_docs_list.build(
+ util::invert(docid_to_lexid, params.num_completions));
+ cm_builder.build(m_completions);
+ di_builder.build(m_dictionary);
+ ii_builder.build(m_inverted_index);
+ }
+
+ template <typename Probe>
+ iterator_type prefix_topk(std::string const& query, const uint32_t k,
+ Probe& probe) {
+ assert(k <= constants::MAX_K);
+
+ probe.start(0);
+ init();
+ completion_type prefix;
+ byte_range suffix;
+ constexpr bool must_find_prefix = true;
+ if (!parse(m_dictionary, query, prefix, suffix, must_find_prefix)) {
+ return m_pool.begin();
+ }
+ probe.stop(0);
+
+ probe.start(1);
+ range suffix_lex_range = m_dictionary.locate_prefix(suffix);
+ if (suffix_lex_range.is_invalid()) return m_pool.begin();
+ suffix_lex_range.begin += 1;
+ suffix_lex_range.end += 1;
+ range r = m_completions.locate_prefix(prefix, suffix_lex_range);
+ if (r.is_invalid()) return m_pool.begin();
+ uint32_t num_completions =
+ m_unsorted_docs_list.topk(r, k, m_pool.scores());
+ probe.stop(1);
+
+ probe.start(2);
+ extract_completions(num_completions);
+ auto it = extract_strings(num_completions);
+ probe.stop(2);
+
+ return it;
+ }
+
+ template <typename Probe>
+ iterator_type conjunctive_topk(std::string const& query, const uint32_t k,
+ Probe& probe) {
+ assert(k <= constants::MAX_K);
+
+ probe.start(0);
+ init();
+ completion_type prefix;
+ byte_range suffix;
+ constexpr bool must_find_prefix = false;
+ parse(m_dictionary, query, prefix, suffix, must_find_prefix);
+ probe.stop(0);
+
+ probe.start(1);
+ range suffix_lex_range = m_dictionary.locate_prefix(suffix);
+ if (suffix_lex_range.is_invalid()) return m_pool.begin();
+ suffix_lex_range.begin += 1;
+ suffix_lex_range.end += 1;
+ uint32_t num_completions =
+ conjunctive_topk(prefix, suffix_lex_range, k);
+ probe.stop(1);
+
+ probe.start(2);
+ extract_completions(num_completions);
+ auto it = extract_strings(num_completions);
+ probe.stop(2);
+
+ return it;
+ }
+
+ // iterator_type topk(std::string const& query, const uint32_t k) {
+ // assert(k <= constants::MAX_K);
+ // init();
+ // completion_type prefix;
+ // byte_range suffix;
+ // parse(m_dictionary, query, prefix, suffix);
+
+ // range suffix_lex_range = m_dictionary.locate_prefix(suffix);
+ // if (suffix_lex_range.is_invalid()) return m_pool.begin();
+
+ // suffix_lex_range.begin += 1;
+ // suffix_lex_range.end += 1;
+ // range r = m_completions.locate_prefix(prefix, suffix_lex_range);
+
+ // uint32_t num_completions = 0;
+ // if (r.is_valid()) {
+ // num_completions = m_unsorted_docs_list.topk(r, k,
+ // m_pool.scores());
+ // }
+
+ // if (num_completions < k) {
+ // num_completions = conjunctive_topk(prefix, suffix_lex_range, k);
+ // }
+
+ // extract_completions(num_completions);
+ // return extract_strings(num_completions);
+ // }
+
+ size_t bytes() const {
+ return m_completions.bytes() + m_unsorted_docs_list.bytes() +
+ m_dictionary.bytes() + m_docid_to_lexid.bytes() +
+ m_inverted_index.bytes();
+ }
+
+ void print_stats() const;
+
+ template <typename Visitor>
+ void visit(Visitor& visitor) {
+ visitor.visit(m_completions);
+ visitor.visit(m_unsorted_docs_list);
+ visitor.visit(m_dictionary);
+ visitor.visit(m_inverted_index);
+ visitor.visit(m_docid_to_lexid);
+ }
+
+private:
+ Completions m_completions;
+ unsorted_list_type m_unsorted_docs_list;
+ Dictionary m_dictionary;
+ BlockedInvertedIndex m_inverted_index;
+ compact_vector m_docid_to_lexid;
+
+ scored_string_pool m_pool;
+ completion_set m_topk_completion_set;
+
+ void init() {
+ m_pool.clear();
+ m_pool.init();
+ assert(m_pool.size() == 0);
+ }
+
+ void extract_completions(const uint32_t num_completions) {
+ auto const& topk_scores = m_pool.scores();
+ auto& completions = m_topk_completion_set.completions();
+ auto& sizes = m_topk_completion_set.sizes();
+ for (uint32_t i = 0; i != num_completions; ++i) {
+ auto doc_id = topk_scores[i];
+ auto lex_id = m_docid_to_lexid[doc_id];
+ uint8_t size = m_completions.extract(lex_id, completions[i]);
+ sizes[i] = size;
+ }
+ }
+
+ typedef typename BlockedInvertedIndex::block_type block_t;
+
+ struct block_type_comparator {
+ bool operator()(block_t& l, block_t& r) {
+ return l.docs_iterator.operator*() > r.docs_iterator.operator*();
+ }
+ };
+
+ uint32_t conjunctive_topk(completion_type& prefix, const range suffix,
+ const uint32_t k) {
+ auto& topk_scores = m_pool.scores();
+
+ typedef min_heap<block_t, block_type_comparator>
+ min_priority_queue_type;
+ min_priority_queue_type q;
+ uint32_t current_block_id = m_inverted_index.block_id(suffix.begin);
+ uint32_t current_block_boundary =
+ m_inverted_index.block_boundary(current_block_id);
+ for (uint32_t i = suffix.begin; i != suffix.end; ++i) {
+ assert(i > 0);
+ if (i > current_block_boundary) {
+ q.push_back(m_inverted_index.block(current_block_id));
+ current_block_id += 1;
+ current_block_boundary =
+ m_inverted_index.block_boundary(current_block_id);
+ }
+ }
+ q.push_back(m_inverted_index.block(current_block_id));
+ q.make_heap();
+
+ uint32_t results = 0;
+
+ auto check = [&](block_t& block, id_type doc_id) {
+ uint64_t pos = block.docs_iterator.position();
+ assert(block.docs_iterator.access(pos) == doc_id);
+ uint64_t begin = block.offsets_iterator.access(pos);
+ uint64_t end = block.offsets_iterator.access(pos + 1);
+ assert(end > begin);
+ for (uint64_t i = begin; i != end; ++i) {
+ auto t = block.terms_iterator.access(i) + block.lower_bound;
+ if (t > suffix.end) break;
+ if (suffix.contains(t)) {
+ topk_scores[results++] = doc_id;
+ break;
+ }
+ }
+ };
+
+ if (prefix.size() == 0) {
+ while (!q.empty()) {
+ auto& z = q.top();
+ auto doc_id = z.docs_iterator.operator*();
+ check(z, doc_id);
+ if (results == k) return results;
+ z.docs_iterator.next();
+ if (!z.docs_iterator.has_next()) q.pop();
+ q.heapify();
+ }
+ } else {
+ deduplicate(prefix);
+ auto it = m_inverted_index.intersection_iterator(prefix, suffix);
+ for (; it.has_next() and !q.empty(); ++it) {
+ auto doc_id = *it;
+ while (!q.empty()) {
+ auto& z = q.top();
+ auto val = z.docs_iterator.operator*();
+ if (val > doc_id) break;
+ if (val < doc_id) {
+ val = z.docs_iterator.next_geq(doc_id);
+ if (!z.docs_iterator.has_next()) {
+ q.pop();
+ } else {
+ q.heapify();
+ }
+ } else {
+ if (val == doc_id) {
+ check(z, doc_id);
+ if (results == k) return results;
+ }
+ break;
+ }
+ }
+ }
+ }
+
+ return results;
+ }
+
+ iterator_type extract_strings(const uint32_t num_completions) {
+ auto const& completions = m_topk_completion_set.completions();
+ auto const& sizes = m_topk_completion_set.sizes();
+ for (uint32_t i = 0; i != num_completions; ++i) {
+ auto const& c = completions[i];
+ uint32_t size = sizes[i];
+ uint64_t offset = m_pool.bytes();
+ uint8_t* decoded = m_pool.data() + offset;
+ for (uint32_t j = 0; j != size; ++j) {
+ auto term_id = c[j];
+ uint8_t len = m_dictionary.extract(term_id, decoded);
+ decoded += len;
+ offset += len;
+ if (j != size - 1) {
+ *decoded++ = ' ';
+ offset++;
+ }
+ }
+ m_pool.push_back_offset(offset);
+ }
+ assert(m_pool.size() == num_completions);
+ return m_pool.begin();
+ }
+};
+
+} // namespace autocomplete
\ No newline at end of file
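`conjunctive_topk` here walks blocks instead of individual term lists: term ids are partitioned into contiguous blocks, and the suffix's lexicographic range touches only the blocks that overlap it (see block_id and block_boundary above). A sketch of the boundary lookup, assuming boundaries are stored as an increasing sequence of last-term-ids per block, which is an illustrative layout rather than the index's actual one:

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// boundaries[b] = last term id (inclusive) covered by block b
uint32_t block_of(std::vector<uint32_t> const& boundaries, uint32_t term_id) {
    // index of the first block whose boundary is >= term_id
    return uint32_t(std::lower_bound(boundaries.begin(), boundaries.end(),
                                     term_id) -
                    boundaries.begin());
}
// With boundaries {9, 24, 99}: block_of(8) == 0 and block_of(30) == 2,
// so a suffix range [8, 30] only needs blocks 0 through 2.
```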
diff --git a/archive/include/autocomplete_common.hpp b/archive/include/autocomplete_common.hpp
new file mode 100644
index 0000000..21d952b
--- /dev/null
+++ b/archive/include/autocomplete_common.hpp
@@ -0,0 +1,72 @@
+#pragma once
+
+#include "util_types.hpp"
+#include "min_heap.hpp"
+#include "unsorted_list.hpp"
+#include "minimal_docids.hpp"
+#include "succinct_rmq/cartesian_tree.hpp"
+
+namespace autocomplete {
+
+typedef unsorted_list<cartesian_tree> unsorted_list_type;
+
+template <typename Dictionary>
+bool parse(Dictionary const& dict, std::string const& query,
+ completion_type& prefix, byte_range& suffix, bool must_find_prefix) {
+ byte_range_iterator it(string_to_byte_range(query));
+ while (true) {
+ suffix = it.next();
+ if (!it.has_next()) break;
+ auto term_id = dict.locate(suffix);
+ if (term_id != global::invalid_term_id) {
+ prefix.push_back(term_id);
+ } else {
+ if (must_find_prefix) return false;
+ }
+ }
+ return true;
+}
+
+void deduplicate(completion_type& c) {
+ std::sort(c.begin(), c.end());
+ auto end = std::unique(c.begin(), c.end());
+ c.resize(std::distance(c.begin(), end));
+}
+
+template <typename InvertedIndex>
+uint32_t heap_topk(InvertedIndex const& index, const range r, const uint32_t k,
+ std::vector<id_type>& topk_scores) {
+ assert(r.is_valid());
+
+ typedef min_heap<typename InvertedIndex::iterator_type,
+ iterator_comparator<typename InvertedIndex::iterator_type>>
+ min_priority_queue_type;
+
+ min_priority_queue_type q;
+ q.reserve(r.end - r.begin + 1); // inclusive range
+ assert(r.begin > 0);
+ for (uint64_t term_id = r.begin; term_id <= r.end; ++term_id) {
+ q.push_back(index.iterator(term_id - 1));
+ }
+ q.make_heap();
+
+ uint32_t results = 0;
+
+ while (!q.empty()) {
+ auto& z = q.top();
+ auto doc_id = *z;
+ bool already_present = std::binary_search(
+ topk_scores.begin(), topk_scores.begin() + results, doc_id);
+ if (!already_present) {
+ topk_scores[results++] = doc_id;
+ if (results == k) return results;
+ }
+ z.next();
+ if (!z.has_next()) q.pop();
+ q.heapify();
+ }
+
+ return results;
+}
+
+} // namespace autocomplete
\ No newline at end of file
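`parse` pushes one term id per recognized token, so a query such as "red red car" yields duplicates that would make the intersection degenerate; `deduplicate` above is plain sort + unique + shrink. An equivalent standalone form:

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

void dedup(std::vector<uint32_t>& c) {
    std::sort(c.begin(), c.end());
    c.erase(std::unique(c.begin(), c.end()), c.end());
}
// e.g. {7, 3, 7, 1} becomes {1, 3, 7}: each repeated term is intersected once
```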
diff --git a/include/bit_vector.hpp b/archive/include/bit_vector.hpp
similarity index 98%
rename from include/bit_vector.hpp
rename to archive/include/bit_vector.hpp
index 676c112..4afb7dd 100644
--- a/include/bit_vector.hpp
+++ b/archive/include/bit_vector.hpp
@@ -242,12 +242,6 @@ struct bit_vector {
build(in);
}
- bit_vector& operator=(bit_vector const& other) {
- bit_vector tmp(other);
- tmp.swap(*this);
- return *this;
- }
-
void swap(bit_vector& other) {
std::swap(other.m_size, m_size);
other.m_bits.swap(m_bits);
@@ -412,6 +406,7 @@ struct bits_getter {
, m_base(offset)
, m_width(width)
, m_mask(-(width == 64) | ((uint64_t(1) << width) - 1)) {
+ assert(width > 0);
util::prefetch(m_data + m_base / 64);
}
diff --git a/include/blocked_inverted_index.hpp b/archive/include/blocked_inverted_index.hpp
similarity index 73%
rename from include/blocked_inverted_index.hpp
rename to archive/include/blocked_inverted_index.hpp
index 79319fe..2f1af3a 100644
--- a/include/blocked_inverted_index.hpp
+++ b/archive/include/blocked_inverted_index.hpp
@@ -21,9 +21,12 @@ struct blocked_inverted_index {
builder(parameters const& params, float c)
: m_num_integers(0)
- , m_num_docs(params.num_completions)
+ , m_num_docs(params.universe)
, m_num_terms(params.num_terms) {
- assert(c > 0.0);
+ if (!(c > 0.0 and c <= 1.0)) {
+ throw std::runtime_error("c must be in (0,1]");
+ }
+
essentials::logger("building blocked_inverted_index with c = " +
std::to_string(c) + "...");
@@ -115,6 +118,7 @@ struct blocked_inverted_index {
auto max =
*std::max_element(term_list.begin(), term_list.end());
uint64_t width = util::ceil_log2(max + 1);
+ if (width == 0) width = 1;
// std::cout << "using " << width << " [bpi]" << std::endl;
m_terms.append_bits(width, 6);
for (auto t : term_list) m_terms.append_bits(t, width);
@@ -248,6 +252,11 @@ struct blocked_inverted_index {
return id;
}
+ uint32_t block_boundary(uint32_t block_id) const {
+ assert(block_id < m_blocks.size());
+ return m_blocks[block_id];
+ }
+
struct block_type {
docs_iterator_type docs_iterator;
offsets_iterator_type offsets_iterator;
@@ -263,61 +272,45 @@ struct blocked_inverted_index {
: m_i(0)
, m_num_docs(ii->num_docs())
, m_suffix(r) {
- assert(!r.is_invalid());
-
- if (!term_ids.empty()) {
- m_iterators.reserve(term_ids.size()); // at most
- std::sort(term_ids.begin(), term_ids.end());
- uint32_t current_block_id = ii->block_id(term_ids.front());
- uint32_t i = 0;
- uint32_t prev_i = 0;
- for (; i != term_ids.size(); ++i) {
- auto term_id = term_ids[i];
- assert(term_id > 0);
- uint32_t b = ii->block_id(term_id);
- if (b > current_block_id) {
- auto block = ii->block(current_block_id);
- block.term_ids.reserve(term_ids.size()); // at most
- for (; prev_i != i; ++prev_i) {
- block.term_ids.push_back(term_ids[prev_i]);
- }
- m_iterators.push_back(std::move(block));
+ assert(r.is_valid());
+ assert(!term_ids.empty());
+ assert(std::is_sorted(term_ids.begin(), term_ids.end()));
+ assert(std::unique(term_ids.begin(), term_ids.end()) ==
+ term_ids.end());
+
+ m_blocks.reserve(term_ids.size()); // at most
+ uint32_t current_block_id = ii->block_id(term_ids.front());
+ uint32_t i = 0;
+ uint32_t prev_i = 0;
+ for (; i != term_ids.size(); ++i) {
+ auto term_id = term_ids[i];
+ assert(term_id > 0);
+ uint32_t b = ii->block_id(term_id);
+ if (b > current_block_id) {
+ auto block = ii->block(current_block_id);
+ block.term_ids.reserve(term_ids.size()); // at most
+ for (; prev_i != i; ++prev_i) {
+ block.term_ids.push_back(term_ids[prev_i]);
}
- current_block_id = b;
+ m_blocks.push_back(std::move(block));
}
-
- auto block = ii->block(current_block_id);
- block.term_ids.reserve(term_ids.size()); // at most
- for (; prev_i != i; ++prev_i) {
- block.term_ids.push_back(term_ids[prev_i]);
- }
- m_iterators.push_back(std::move(block));
-
- assert(m_iterators.size() > 0);
- std::sort(m_iterators.begin(), m_iterators.end(),
- [](auto const& l, auto const& r) {
- return l.docs_iterator.size() <
- r.docs_iterator.size();
- });
-
- m_candidate = m_iterators[0].docs_iterator.access(0);
- } else {
- m_candidate = 0;
+ current_block_id = b;
}
- {
- uint32_t current_block_id = ii->block_id(r.begin);
- uint32_t i = r.begin;
- for (; i != r.end; ++i) {
- assert(i > 0);
- uint32_t b = ii->block_id(i);
- if (b > current_block_id) {
- m_range.push_back(ii->block(current_block_id));
- }
- current_block_id = b;
- }
- m_range.push_back(ii->block(current_block_id));
+ auto block = ii->block(current_block_id);
+ block.term_ids.reserve(term_ids.size()); // at most
+ for (; prev_i != i; ++prev_i) {
+ block.term_ids.push_back(term_ids[prev_i]);
}
+ m_blocks.push_back(std::move(block));
+
+ std::sort(m_blocks.begin(), m_blocks.end(),
+ [](auto const& l, auto const& r) {
+ return l.docs_iterator.size() <
+ r.docs_iterator.size();
+ });
+
+ m_candidate = m_blocks[0].docs_iterator.access(0);
next();
}
@@ -331,62 +324,37 @@ struct blocked_inverted_index {
}
void operator++() {
- assert(m_i == m_iterators.size());
- if (!m_iterators.empty()) {
- if (m_iterators.size() > 1) {
- m_candidate = m_iterators[0].docs_iterator.next();
- }
- } else {
- m_candidate += 1;
+ assert(m_i == m_blocks.size());
+ if (m_blocks.size() > 1) {
+ m_candidate = m_blocks[0].docs_iterator.next();
}
m_i = 0;
next();
}
- bool intersects() {
- for (auto& block : m_range) {
- uint64_t val = block.docs_iterator.next_geq(m_candidate);
- if (val == m_candidate) {
- uint64_t pos = block.docs_iterator.position();
- assert(block.docs_iterator.access(pos) == m_candidate);
- uint64_t begin = block.offsets_iterator.access(pos);
- uint64_t end = block.offsets_iterator.access(pos + 1);
- assert(end > begin);
- uint32_t lower_bound = block.lower_bound;
- for (uint64_t i = begin; i != end; ++i) {
- auto t = block.terms_iterator.access(i) + lower_bound;
- if (t > m_suffix.end) break;
- if (m_suffix.contains(t)) return true;
- }
- }
- }
- return false;
- }
-
private:
id_type m_candidate;
size_t m_i;
uint64_t m_num_docs;
- std::vector<block_type> m_iterators;
+ std::vector<block_type> m_blocks;
 std::vector<block_type> m_range;
range m_suffix;
bool in() { // is candidate doc in intersection?
- uint64_t pos = m_iterators[m_i].docs_iterator.position();
- if (pos == m_iterators[m_i].docs_iterator.size()) return false;
- uint64_t begin = m_iterators[m_i].offsets_iterator.access(pos);
- uint64_t end = m_iterators[m_i].offsets_iterator.access(pos + 1);
+ auto& b = m_blocks[m_i];
+ uint64_t pos = b.docs_iterator.position();
+ if (pos == b.docs_iterator.size()) return false;
+ uint64_t begin = b.offsets_iterator.access(pos);
+ uint64_t end = b.offsets_iterator.access(pos + 1);
assert(end > begin);
- if (end - begin < m_iterators[m_i].term_ids.size()) return false;
+ if (end - begin < b.term_ids.size()) return false;
uint64_t i = begin;
- uint32_t lower_bound = m_iterators[m_i].lower_bound;
- for (auto x : m_iterators[m_i].term_ids) {
+ for (auto x : b.term_ids) {
bool found = false;
for (; i != end; ++i) {
- auto t =
- m_iterators[m_i].terms_iterator.access(i) + lower_bound;
+ auto t = b.terms_iterator.access(i) + b.lower_bound;
if (t == x) {
found = true;
break;
@@ -399,18 +367,17 @@ struct blocked_inverted_index {
}
void next() {
- if (m_iterators.empty()) return;
- if (m_iterators.size() == 1) {
- while (m_candidate < m_num_docs and m_i != m_iterators.size()) {
+ if (m_blocks.size() == 1) {
+ while (m_candidate < m_num_docs and m_i != m_blocks.size()) {
assert(m_i == 0);
- m_candidate = m_iterators[m_i].docs_iterator.next();
+ m_candidate = m_blocks[m_i].docs_iterator.next();
if (in()) ++m_i;
}
} else {
- while (m_candidate < m_num_docs and m_i != m_iterators.size()) {
+ while (m_candidate < m_num_docs and m_i != m_blocks.size()) {
// NOTE: since we work with unions of posting lists,
// next_geq by scan runs faster
- auto val = m_iterators[m_i].docs_iterator.next_geq_by_scan(
+ auto val = m_blocks[m_i].docs_iterator.next_geq_by_scan(
m_candidate);
bool is_in = in();
if (val == m_candidate and is_in) {
@@ -429,34 +396,6 @@ struct blocked_inverted_index {
return intersection_iterator_type(this, term_ids, r);
}
- template <typename Visitor>
- void visit(Visitor& visitor) {
- visitor.visit(m_num_integers);
- visitor.visit(m_num_docs);
- visitor.visit(m_num_terms);
- visitor.visit(m_blocks);
- visitor.visit(m_pointers_to_lists);
- visitor.visit(m_lists);
- visitor.visit(m_pointers_to_offsets);
- visitor.visit(m_offsets);
- visitor.visit(m_pointers_to_terms);
- visitor.visit(m_terms);
- }
-
-private:
- uint64_t m_num_integers;
- uint64_t m_num_docs;
- uint64_t m_num_terms;
-
- std::vector<uint32_t> m_blocks;
-
- ef::ef_sequence m_pointers_to_lists;
- bit_vector m_lists;
- ef::ef_sequence m_pointers_to_offsets;
- bit_vector m_offsets;
- ef::ef_sequence m_pointers_to_terms;
- bit_vector m_terms;
-
block_type block(uint32_t block_id) const {
assert(block_id < num_blocks());
block_type b;
@@ -485,6 +424,34 @@ struct blocked_inverted_index {
return b;
}
+
+ template <typename Visitor>
+ void visit(Visitor& visitor) {
+ visitor.visit(m_num_integers);
+ visitor.visit(m_num_docs);
+ visitor.visit(m_num_terms);
+ visitor.visit(m_blocks);
+ visitor.visit(m_pointers_to_lists);
+ visitor.visit(m_lists);
+ visitor.visit(m_pointers_to_offsets);
+ visitor.visit(m_offsets);
+ visitor.visit(m_pointers_to_terms);
+ visitor.visit(m_terms);
+ }
+
+private:
+ uint64_t m_num_integers;
+ uint64_t m_num_docs;
+ uint64_t m_num_terms;
+
+ std::vector<uint32_t> m_blocks;
+
+ ef::ef_sequence m_pointers_to_lists;
+ bit_vector m_lists;
+ ef::ef_sequence m_pointers_to_offsets;
+ bit_vector m_offsets;
+ ef::ef_sequence m_pointers_to_terms;
+ bit_vector m_terms;
};
} // namespace autocomplete
\ No newline at end of file
diff --git a/archive/include/building_util.hpp b/archive/include/building_util.hpp
new file mode 100644
index 0000000..0398879
--- /dev/null
+++ b/archive/include/building_util.hpp
@@ -0,0 +1,39 @@
+#pragma once
+
+#include "util.hpp"
+#include "bit_vector.hpp"
+
+namespace autocomplete {
+namespace util {
+
+std::vector<id_type> invert(std::vector<id_type> const& docid_to_lexid,
+ uint64_t size) {
+ std::vector<id_type> lexid_to_docid(size);
+ for (uint64_t doc_id = 0; doc_id != docid_to_lexid.size(); ++doc_id) {
+ if (docid_to_lexid[doc_id] < size) {
+ lexid_to_docid[docid_to_lexid[doc_id]] = doc_id;
+ }
+ }
+ return lexid_to_docid;
+}
+
+void push_pad(bit_vector_builder& bvb, uint64_t alignment = 8) {
+ uint64_t mod = bvb.size() % alignment;
+ if (mod) {
+ uint64_t pad = alignment - mod;
+ bvb.append_bits(0, pad);
+ assert(bvb.size() % alignment == 0);
+ }
+}
+
+void eat_pad(bits_iterator& it, uint64_t alignment = 8) {
+ uint64_t mod = it.position() % alignment;
+ if (mod) {
+ uint64_t pad = alignment - mod;
+ it.get_bits(pad);
+ assert(it.position() % alignment == 0);
+ }
+}
+
+} // namespace util
+} // namespace autocomplete
\ No newline at end of file
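`push_pad` and `eat_pad` keep byte-aligned posting lists starting on whole-byte boundaries within the bit stream, so the writer and the reader must agree on the same arithmetic. That arithmetic in isolation:

```cpp
#include <cassert>
#include <cstdint>

// Advance a bit position to the next multiple of `alignment` bits.
uint64_t pad_to(uint64_t bit_pos, uint64_t alignment = 8) {
    uint64_t mod = bit_pos % alignment;
    uint64_t padded = mod ? bit_pos + (alignment - mod) : bit_pos;
    assert(padded % alignment == 0);
    return padded;
}
// pad_to(13) == 16: the next list can then start on a whole byte.
```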
diff --git a/include/compact_forward_index.hpp b/archive/include/compact_forward_index.hpp
similarity index 92%
rename from include/compact_forward_index.hpp
rename to archive/include/compact_forward_index.hpp
index 74ad769..50267f4 100644
--- a/include/compact_forward_index.hpp
+++ b/archive/include/compact_forward_index.hpp
@@ -14,25 +14,25 @@ struct compact_forward_index {
: m_num_integers(0)
, m_num_terms(params.num_terms) {
essentials::logger("building forward_index...");
- uint64_t num_completions = params.num_completions;
+ uint64_t universe = params.universe;
std::ifstream input(
(params.collection_basename + ".forward").c_str(),
std::ios_base::in);
-
- std::vector<id_type> terms;
- terms.reserve(params.num_completions *
+ std::vector<id_type> terms;
+ terms.reserve(universe *
constants::MAX_NUM_TERMS_PER_QUERY); // at most
uint64_t size = 0;
m_pointers.push_back(0);
- for (uint64_t i = 0; i != num_completions; ++i) {
+ for (uint64_t i = 0; i != universe; ++i) {
uint32_t n = 0;
input >> n;
- assert(n > 0 and n < constants::MAX_NUM_TERMS_PER_QUERY);
+ assert(n < constants::MAX_NUM_TERMS_PER_QUERY);
m_num_integers += n;
size += n;
for (uint64_t k = 0; k != n; ++k) {
id_type x;
input >> x;
+ assert(x > 0);
terms.push_back(x);
}
m_pointers.push_back(size);
@@ -90,6 +90,7 @@ struct compact_forward_index {
bool intersects(const range r) const {
for (uint64_t i = 0; i != size(); ++i) {
auto val = m_cv[m_base + i];
+ assert(val > 0);
if (r.contains(val)) return true;
}
return false;
@@ -103,6 +104,7 @@ struct compact_forward_index {
};
forward_list_iterator_type iterator(id_type doc_id) {
+ assert(doc_id < num_docs());
uint64_t pos = m_pointers.access(doc_id);
uint64_t n = m_pointers.access(doc_id + 1) - pos;
return {m_data, pos, n};
diff --git a/include/compact_vector.hpp b/archive/include/compact_vector.hpp
similarity index 89%
rename from include/compact_vector.hpp
rename to archive/include/compact_vector.hpp
index f0cd1bd..ac8e275 100644
--- a/include/compact_vector.hpp
+++ b/archive/include/compact_vector.hpp
@@ -73,26 +73,33 @@ struct compact_vector {
};
struct builder {
- builder(uint64_t n = 0, uint64_t w = 0)
+ builder()
+ : m_back(0)
+ , m_cur_block(0)
+ , m_cur_shift(0) {}
+
+ builder(uint64_t n, uint64_t w)
: m_size(n)
- , m_width(!w ? w + 1 : w)
+ , m_width(w)
, m_mask(-(w == 64) | ((1ULL << w) - 1))
, m_back(0)
, m_cur_block(0)
, m_cur_shift(0)
, m_bits(essentials::words_for(m_size * m_width), 0) {
- if (m_width > 64) {
- throw std::runtime_error("width must be <= 64");
+ if (m_width == 0 or m_width > 64) {
+ throw std::runtime_error("width must be > 0 and <= 64");
}
}
void resize(size_t n, uint64_t w) {
m_size = n;
- m_width = !w ? w + 1 : w;
- if (m_width > 64) {
- throw std::runtime_error("width must be <= 64");
+ m_width = w;
+ if (m_width == 0 or m_width > 64) {
+ throw std::runtime_error("width must be > 0 and <= 64");
}
m_mask = -(w == 64) | ((uint64_t(1) << w) - 1);
+ std::cout << "using " << essentials::words_for(m_size * m_width)
+ << " words" << std::endl;
m_bits.resize(essentials::words_for(m_size * m_width), 0);
}
@@ -108,7 +115,7 @@ struct compact_vector {
throw std::runtime_error("width must be greater than 0");
}
- for (uint64_t i = 0; i < n; ++i, ++begin) {
+ for (uint64_t i = 0; i != n; ++i, ++begin) {
push_back(*begin);
}
}
@@ -220,8 +227,13 @@ struct compact_vector {
void build(Iterator begin, uint64_t n) {
uint64_t max = *std::max_element(begin, begin + n);
uint64_t width = util::ceil_log2(max + 1);
- std::cout << "\tusing " << width << " [bpi]" << std::endl;
- compact_vector::builder builder(begin, n, width);
+ build(begin, n, width);
+ }
+
+ template <typename Iterator>
+ void build(Iterator begin, uint64_t n, uint64_t w) {
+ std::cout << "\tusing " << w << " [bpi]" << std::endl;
+ compact_vector::builder builder(begin, n, w);
builder.build(*this);
}
@@ -277,7 +289,7 @@ struct compact_vector {
}
uint64_t find(const range r, uint64_t id) {
- assert(!r.is_invalid());
+ assert(r.is_valid());
assert(r.end <= size());
return util::find(*this, id, r.begin, r.end - 1);
}
@@ -312,4 +324,5 @@ struct compact_vector {
uint64_t m_mask;
std::vector<uint64_t> m_bits;
};
+
} // namespace autocomplete
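The builder stores every integer with the same fixed width, chosen as the fewest bits that can represent the largest value; this is also why a zero width is now rejected outright instead of being silently bumped to 1. A sketch of the width computation:

```cpp
#include <cstdint>

// Bits needed to distinguish the values 0 .. x-1 (compare util::ceil_log2).
uint64_t ceil_log2(uint64_t x) {
    uint64_t b = 0;
    while ((uint64_t(1) << b) < x) ++b;
    return b;
}
// A column whose maximum value is 1000 needs ceil_log2(1001) == 10 bits
// per integer instead of 32; an all-zero column gives width 0, the case
// the builder now reports as an error.
```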
diff --git a/include/completion_trie.hpp b/archive/include/completion_trie.hpp
similarity index 97%
rename from include/completion_trie.hpp
rename to archive/include/completion_trie.hpp
index 8ae9036..2bc68ea 100644
--- a/include/completion_trie.hpp
+++ b/archive/include/completion_trie.hpp
@@ -166,16 +166,16 @@ struct completion_trie {
completion_trie() {}
// If the last token of the query is not completely specified,
- // then we search for its lexicographic range among the children of c.
+ // then we search for its lexicographic range among the children of prefix.
// Return [a,b)
- range locate_prefix(completion_type const& c,
+ range locate_prefix(completion_type const& prefix,
range suffix_lex_range) const {
- range r{global::not_found, global::not_found};
+ range r = global::invalid_range;
range pointer{0, m_nodes.front().size()};
uint32_t i = 0;
- for (; i < c.size(); ++i) {
- uint64_t pos = m_nodes[i].find(pointer, c[i]);
+ for (; i < prefix.size(); ++i) {
+ uint64_t pos = m_nodes[i].find(pointer, prefix[i]);
if (pos == global::not_found) return global::invalid_range;
pointer = m_pointers[i][pos];
}
@@ -195,10 +195,11 @@ struct completion_trie {
r.end += size;
}
- assert(r.end > r.begin);
+ assert(r.is_valid());
return r;
}
+ // NOTE: not used
bool is_member(completion_type const& c) const {
assert(c.size() > 0);
range pointer{0, m_nodes.front().size()};
diff --git a/include/constants.hpp b/archive/include/constants.hpp
similarity index 100%
rename from include/constants.hpp
rename to archive/include/constants.hpp
diff --git a/include/ef/compact_ef.hpp b/archive/include/ef/compact_ef.hpp
similarity index 100%
rename from include/ef/compact_ef.hpp
rename to archive/include/ef/compact_ef.hpp
diff --git a/include/ef/darray.hpp b/archive/include/ef/darray.hpp
similarity index 100%
rename from include/ef/darray.hpp
rename to archive/include/ef/darray.hpp
diff --git a/include/ef/ef_parameters.hpp b/archive/include/ef/ef_parameters.hpp
similarity index 100%
rename from include/ef/ef_parameters.hpp
rename to archive/include/ef/ef_parameters.hpp
diff --git a/include/ef/ef_sequence.hpp b/archive/include/ef/ef_sequence.hpp
similarity index 93%
rename from include/ef/ef_sequence.hpp
rename to archive/include/ef/ef_sequence.hpp
index 10970d6..2e9e293 100644
--- a/include/ef/ef_sequence.hpp
+++ b/archive/include/ef/ef_sequence.hpp
@@ -49,6 +49,7 @@ struct ef_sequence {
++within;
}
assert(values.size() == n);
+ assert(std::is_sorted(values.begin(), values.end()));
compress(values.begin(), values.size(), values.back());
}
@@ -142,33 +143,25 @@ struct ef_sequence {
}
uint64_t find(const range r, uint64_t id) const {
- assert(!r.is_invalid());
+ assert(r.is_valid());
assert(r.end <= size());
uint64_t prev_upper = previous_range_upperbound(r);
return util::find(*this, id + prev_upper, r.begin, r.end - 1);
}
range find(const range r, const range lex) const {
- assert(!r.is_invalid());
+ assert(r.is_valid());
assert(r.end <= size());
auto prev_upper = previous_range_upperbound(r);
-
- uint64_t begin =
- util::next_geq(*this, lex.begin + prev_upper, r.begin, r.end - 1);
- if (begin == global::not_found) {
+ uint64_t id_begin = lex.begin + prev_upper;
+ uint64_t id_end = lex.end + prev_upper;
+ uint64_t begin = util::next_geq(*this, id_begin, r.begin, r.end - 1);
+ if (begin == global::not_found or access(begin) > id_end) {
return {r.end, r.end};
}
-
- if (lex.begin == lex.end) {
- return {begin, begin + 1};
- }
-
- uint64_t id_end = lex.end + prev_upper;
+ if (lex.begin == lex.end) return {begin, begin + 1};
uint64_t end = util::next_geq(*this, id_end, begin, r.end - 1);
- if (end == global::not_found) {
- return {begin, r.end};
- }
-
+ if (end == global::not_found) return {begin, r.end};
return {begin, access(end) != id_end ? end : end + 1};
}
@@ -251,7 +244,7 @@ struct ef_sequence {
}
uint64_t previous_range_upperbound(const range r) const {
- assert(!r.is_invalid());
+ assert(r.is_valid());
return r.begin ? access(r.begin - 1) : 0;
}
};
diff --git a/include/fc_dictionary.hpp b/archive/include/fc_dictionary.hpp
similarity index 95%
rename from include/fc_dictionary.hpp
rename to archive/include/fc_dictionary.hpp
index 271f970..52e3971 100644
--- a/include/fc_dictionary.hpp
+++ b/archive/include/fc_dictionary.hpp
@@ -37,14 +37,17 @@ struct fc_dictionary {
std::string curr;
std::string header;
+ uint64_t total_characters = 0;
for (uint32_t b = 0; b != buckets; ++b) {
input >> header;
+ total_characters += header.size();
write_header(header);
m_pointers_to_headers.push_back(m_headers.size());
prev.swap(header);
uint32_t size = b != buckets - 1 ? BucketSize : tail;
for (uint32_t i = 0; i != size; ++i) {
input >> curr;
+ total_characters += curr.size();
uint32_t l = 0; // |lcp(curr,prev)|
while (l != curr.size() and l != prev.size() and
curr[l] == prev[l]) {
@@ -61,6 +64,9 @@ struct fc_dictionary {
m_buckets.push_back(0);
}
+ std::cout << static_cast<double>(total_characters) / m_size
+ << " characters per string" << std::endl;
+
input.close();
essentials::logger("DONE");
}
@@ -109,6 +115,7 @@ struct fc_dictionary {
fc_dictionary() {}
// NOTE: return inclusive ranges, i.e., [a,b]
+ // 0-based ids
range locate_prefix(byte_range p) const {
if (p.end - p.begin == 0) return {0, size() - 1};
auto bucket_id = locate_buckets(p);
@@ -223,7 +230,7 @@ struct fc_dictionary {
if (cmp < 0) {
bucket_id = mi;
} else {
- bucket_id = mi - 1;
+ bucket_id = hi == -1 ? 0 : hi;
h = header(bucket_id);
}
@@ -307,10 +314,13 @@ struct fc_dictionary {
// NOTE 1: excluding null terminators allows us to use memcpy here
// because we know exactly how many bytes to copy: this is much faster
- // than looping until we hit '\0'. NOTE 2: always copying a fixed amount
+ // than looping until we hit '\0'.
+
+ // NOTE 2: always copying a fixed amount
// of bytes (constants::MAX_NUM_CHARS_PER_QUERY) is much faster than
// copying an exact amount, e.g., suffix_len (although it could be
// less), so do not do: memcpy(out + l, in, suffix_len).
+
memcpy(out + l, in, constants::MAX_NUM_CHARS_PER_QUERY);
return l + suffix_len;
@@ -340,8 +350,7 @@ struct fc_dictionary {
if (cmp < 0) return global::invalid_term_id;
curr += l - lcp_len + 2;
}
- assert(false);
- __builtin_unreachable();
+ return global::invalid_term_id; // term does not exist in dictionary
}
id_type left_locate(byte_range p, byte_range h, id_type bucket_id) const {
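The dictionary is front-coded: inside a bucket every string stores only the length of the prefix shared with its predecessor (the LCP) plus its distinct suffix, and lookups decode forward from the bucket's uncompressed header. A sketch of that layout and decoding, with a hypothetical flat representation:

```cpp
#include <cstdint>
#include <string>
#include <vector>

struct fc_entry {
    uint8_t lcp;         // shared prefix length with the previous string
    std::string suffix;  // remaining characters
};

// Decode the i-th string of a bucket by replaying entries 0..i.
// Entry 0 must have lcp == 0 (it is the bucket header).
std::string decode(std::vector<fc_entry> const& bucket, size_t i) {
    std::string out;
    for (size_t j = 0; j <= i; ++j) {
        out.resize(bucket[j].lcp);  // keep only the shared prefix
        out += bucket[j].suffix;
    }
    return out;
}
```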
diff --git a/include/integer_codes.hpp b/archive/include/integer_codes.hpp
similarity index 100%
rename from include/integer_codes.hpp
rename to archive/include/integer_codes.hpp
diff --git a/include/integer_fc_dictionary.hpp b/archive/include/integer_fc_dictionary.hpp
similarity index 89%
rename from include/integer_fc_dictionary.hpp
rename to archive/include/integer_fc_dictionary.hpp
index 218cacf..29d8743 100644
--- a/include/integer_fc_dictionary.hpp
+++ b/archive/include/integer_fc_dictionary.hpp
@@ -19,7 +19,7 @@ struct integer_fc_dictionary {
essentials::logger(
"building integer_fc_dictionary with bucket size " +
std::to_string(BucketSize) + "...");
- m_doc_ids.reserve(params.num_completions);
+ m_docid_to_lexid.resize(params.universe, id_type(-1));
uint32_t buckets = std::ceil(double(m_size) / (BucketSize + 1));
m_pointers_to_buckets.reserve(buckets + 1);
@@ -35,9 +35,10 @@ struct integer_fc_dictionary {
std::ios_base::in);
completion_iterator it(params, input);
+ id_type lex_id = 0;
for (uint32_t b = 0; b != buckets; ++b) {
auto& header = *it;
- m_doc_ids.push_back(header.doc_id);
+ m_docid_to_lexid[header.doc_id] = lex_id++;
write_header(header.completion);
m_pointers_to_headers.push_back(m_headers.size());
completion_type prev;
@@ -47,7 +48,7 @@ struct integer_fc_dictionary {
for (uint32_t i = 0; i != size; ++i, ++it) {
auto& record = *it;
auto& curr = record.completion;
- m_doc_ids.push_back(record.doc_id);
+ m_docid_to_lexid[record.doc_id] = lex_id++;
uint32_t l = 0; // |lcp(curr,prev)|
while (l != curr.size() and l != prev.size() and
curr[l] == prev[l]) {
@@ -76,7 +77,7 @@ struct integer_fc_dictionary {
other.m_pointers_to_buckets.swap(m_pointers_to_buckets);
other.m_headers.swap(m_headers);
other.m_buckets.swap(m_buckets);
- other.m_doc_ids.swap(m_doc_ids);
+ other.m_docid_to_lexid.swap(m_docid_to_lexid);
}
void build(integer_fc_dictionary& d) {
@@ -88,8 +89,8 @@ struct integer_fc_dictionary {
builder().swap(*this);
}
- std::vector<id_type>& doc_ids() {
- return m_doc_ids;
+ std::vector<id_type>& docid_to_lexid() {
+ return m_docid_to_lexid;
}
private:
@@ -98,7 +99,7 @@ struct integer_fc_dictionary {
std::vector<uint64_t> m_pointers_to_buckets;
std::vector<uint32_t> m_headers;
std::vector<uint32_t> m_buckets;
- std::vector<id_type> m_doc_ids;
+ std::vector<id_type> m_docid_to_lexid;
void write_header(completion_type const& c) {
assert(c.size() > 0 and
@@ -166,19 +167,20 @@ struct integer_fc_dictionary {
prefix.push_back(global::invalid_term_id);
}
- locate_bucket(completion_to_uint32_range(prefix), h_end, bucket_id_end,
- bucket_id_begin // hint
+ locate_right_bucket(completion_to_uint32_range(prefix), h_end,
+ bucket_id_end,
+ bucket_id_begin // hint
);
uint32_t p_end = bucket_id_end * (BucketSize + 1);
p_end += right_locate(completion_to_uint32_range(prefix), h_end,
bucket_id_end);
+ prefix.pop_back();
+
if (p_end < p_begin) {
- prefix.pop_back();
return global::invalid_range;
}
- prefix.pop_back();
if (suffix_lex_range.begin == suffix_lex_range.end) {
prefix.pop_back();
}
@@ -269,13 +271,37 @@ struct integer_fc_dictionary {
if (cmp < 0) {
bucket_id = mi;
} else {
- bucket_id = mi - 1;
+ bucket_id = hi == -1 ? 0 : hi;
h = header(bucket_id);
}
return false;
}
+ void locate_right_bucket(uint32_range t, uint32_range& h,
+ id_type& bucket_id,
+ int lower_bound_hint = 0) const {
+ int lo = lower_bound_hint, hi = buckets() - 1, mi = 0, cmp = 0;
+ size_t n = t.end - t.begin;
+ while (lo <= hi) {
+ mi = (lo + hi) / 2;
+ h = header(mi);
+ cmp = uint32_range_compare(h, t, n);
+ if (cmp > 0) {
+ hi = mi - 1;
+ } else if (cmp <= 0) {
+ lo = mi + 1;
+ }
+ }
+
+ if (cmp < 0) {
+ bucket_id = mi;
+ } else {
+ bucket_id = hi == -1 ? 0 : hi;
+ h = header(bucket_id);
+ }
+ }
+
#define INT_FC_DICT_LOCATE_INIT \
static uint32_t decoded[2 * constants::MAX_NUM_TERMS_PER_QUERY]; \
memcpy(decoded, h.begin, (h.end - h.begin) * sizeof(uint32_t)); \
diff --git a/include/inverted_index.hpp b/archive/include/inverted_index.hpp
similarity index 88%
rename from include/inverted_index.hpp
rename to archive/include/inverted_index.hpp
index 7c84bd7..900fd96 100644
--- a/include/inverted_index.hpp
+++ b/archive/include/inverted_index.hpp
@@ -16,7 +16,7 @@ struct inverted_index {
builder(parameters const& params)
: m_num_integers(0)
- , m_num_docs(params.num_completions) {
+ , m_num_docs(params.universe) {
essentials::logger("building inverted_index...");
uint64_t num_terms = params.num_terms;
@@ -28,10 +28,18 @@ struct inverted_index {
std::vector<id_type> list;
m_pointers.push_back(0);
+
+ uint32_t max_list_size = 0;
+ uint32_t min_list_size = uint32_t(-1);
+
for (uint64_t i = 0; i != num_terms; ++i) {
list.clear();
uint32_t n = 0;
input >> n;
+
+ if (n > max_list_size) max_list_size = n;
+ if (n < min_list_size) min_list_size = n;
+
list.reserve(n);
m_num_integers += n;
for (uint64_t k = 0; k != n; ++k) {
@@ -41,11 +49,17 @@ struct inverted_index {
}
m_minimal_doc_ids.push_back(list.front());
write_gamma_nonzero(m_bvb, n);
- if (ListType::is_byte_aligned) util::push_pad(m_bvb);
+ if constexpr (ListType::is_byte_aligned) util::push_pad(m_bvb);
ListType::build(m_bvb, list.begin(), m_num_docs, list.size());
m_pointers.push_back(m_bvb.size());
}
+ std::cout << "avg. list size = "
+ << static_cast<double>(m_num_integers) / num_terms
+ << std::endl;
+ std::cout << "max_list_size = " << max_list_size << std::endl;
+ std::cout << "min_list_size = " << min_list_size << std::endl;
+
m_pointers.pop_back();
input.close();
essentials::logger("DONE");
@@ -86,7 +100,7 @@ struct inverted_index {
uint64_t offset = m_pointers.access(term_id);
bits_iterator it(m_data, offset);
uint64_t n = read_gamma_nonzero(it);
- if (ListType::is_byte_aligned) util::eat_pad(it);
+ if constexpr (ListType::is_byte_aligned) util::eat_pad(it);
return {m_data, it.position(), m_num_docs, n};
}
diff --git a/include/min_heap.hpp b/archive/include/min_heap.hpp
similarity index 100%
rename from include/min_heap.hpp
rename to archive/include/min_heap.hpp
diff --git a/archive/include/minimal_docids.hpp b/archive/include/minimal_docids.hpp
new file mode 100644
index 0000000..a7cb8f8
--- /dev/null
+++ b/archive/include/minimal_docids.hpp
@@ -0,0 +1,131 @@
+#pragma once
+
+#include "compact_vector.hpp"
+#include "util_types.hpp"
+
+namespace autocomplete {
+
+template <typename RMQ, typename InvertedIndex>
+struct minimal_docids {
+ static const uint32_t SCAN_THRESHOLD = 64;
+ typedef scored_range_with_list_iterator<
+ typename InvertedIndex::iterator_type>
+ range_type;
+ typedef scored_range_with_list_iterator_comparator<
+ typename range_type::iterator_type>
+ comparator_range_type;
+
+ minimal_docids() {}
+
+ void build(std::vector<id_type> const& list) {
+ essentials::logger("building minimal_docids...");
+ m_rmq.build(list, std::less<id_type>());
+ m_list.build(list.begin(), list.size());
+ essentials::logger("DONE");
+ }
+
+ uint32_t topk(InvertedIndex const& index, const range r, const uint32_t k,
+ std::vector<id_type>& topk_scores) {
+ range_type sr;
+ sr.r = {r.begin, r.end - 1}; // rmq needs inclusive ranges
+ sr.min_pos = m_rmq.rmq(sr.r.begin, sr.r.end);
+ sr.min_val = m_list.access(sr.min_pos);
+
+ m_q.clear();
+ m_q.push(sr);
+
+ uint32_t results = 0;
+ while (!m_q.empty()) {
+ auto& min = m_q.top();
+ auto docid = min.minimum();
+ bool already_present = std::binary_search(
+ topk_scores.begin(), topk_scores.begin() + results, docid);
+ if (!already_present) {
+ topk_scores[results++] = docid;
+ if (results == k) break;
+ }
+
+ if (min.is_open()) {
+ min.iterator.next();
+ if (!min.iterator.has_next()) {
+ m_q.pop();
+ }
+ m_q.heapify();
+ } else {
+ // save
+ auto min_range = min.r;
+ auto min_pos = min.min_pos;
+
+ min.set_iterator(index);
+ min.iterator.next();
+ if (!min.iterator.has_next()) {
+ m_q.pop();
+ }
+
+ m_q.heapify();
+
+ if (min_pos > 0 and min_pos - 1 >= min_range.begin) {
+ range_type left;
+ left.r = {min_range.begin, min_pos - 1};
+ if (left.r.end - left.r.begin <= SCAN_THRESHOLD) {
+ left.min_pos = rmq(left.r.begin, left.r.end);
+ } else {
+ left.min_pos = m_rmq.rmq(left.r.begin, left.r.end);
+ }
+ left.min_val = m_list.access(left.min_pos);
+ m_q.push(left);
+ }
+
+ if (min_pos < size() - 1 and min_range.end >= min_pos + 1) {
+ range_type right;
+ right.r = {min_pos + 1, min_range.end};
+ if (right.r.end - right.r.begin <= SCAN_THRESHOLD) {
+ right.min_pos = rmq(right.r.begin, right.r.end);
+ } else {
+ right.min_pos = m_rmq.rmq(right.r.begin, right.r.end);
+ }
+ right.min_val = m_list.access(right.min_pos);
+ m_q.push(right);
+ }
+ }
+ }
+
+ return results;
+ }
+
+ size_t size() const {
+ return m_list.size();
+ }
+
+ size_t bytes() const {
+ return m_rmq.bytes() + m_list.bytes();
+ }
+
+ template <typename Visitor>
+ void visit(Visitor& visitor) {
+ visitor.visit(m_rmq);
+ visitor.visit(m_list);
+ }
+
+private:
+ typedef min_heap<range_type, comparator_range_type> min_priority_queue_type;
+ min_priority_queue_type m_q;
+
+ RMQ m_rmq;
+ compact_vector m_list;
+
+ uint64_t rmq(uint64_t lo, uint64_t hi) { // inclusive endpoints
+ uint64_t pos = lo;
+ id_type min = id_type(-1);
+ for (uint64_t i = lo; i <= hi; ++i) {
+ id_type val = m_list.access(i);
+ if (val < min) {
+ min = val;
+ pos = i;
+ }
+ }
+ return pos;
+ }
+};
+
+} // namespace autocomplete
\ No newline at end of file
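`topk` above is a best-first traversal: pop the subrange whose minimum is smallest, report that minimum, then push the subranges to its left and right. A compact sketch with a linear scan standing in for the succinct RMQ (the real code also deduplicates doc ids and only scans below SCAN_THRESHOLD):

```cpp
#include <cstdint>
#include <queue>
#include <vector>

struct sub {  // [lo, hi] inclusive, with the position/value of its minimum
    size_t lo, hi, min_pos;
    uint32_t min_val;
};

std::vector<uint32_t> topk_min(std::vector<uint32_t> const& a, size_t lo,
                               size_t hi, size_t k) {
    auto rmq = [&](size_t l, size_t h) {  // stand-in for the succinct RMQ
        size_t p = l;
        for (size_t i = l; i <= h; ++i)
            if (a[i] < a[p]) p = i;
        return p;
    };
    auto cmp = [](sub const& x, sub const& y) { return x.min_val > y.min_val; };
    std::priority_queue<sub, std::vector<sub>, decltype(cmp)> q(cmp);
    auto push = [&](size_t l, size_t h) {
        if (l > h || h >= a.size()) return;
        size_t p = rmq(l, h);
        q.push({l, h, p, a[p]});
    };
    push(lo, hi);
    std::vector<uint32_t> out;
    while (!q.empty() && out.size() != k) {
        sub s = q.top();
        q.pop();
        out.push_back(s.min_val);
        if (s.min_pos > s.lo) push(s.lo, s.min_pos - 1);  // left part
        push(s.min_pos + 1, s.hi);                        // right part
    }
    return out;
}
```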
diff --git a/include/parameters.hpp b/archive/include/parameters.hpp
similarity index 81%
rename from include/parameters.hpp
rename to archive/include/parameters.hpp
index db44d71..d628d25 100644
--- a/include/parameters.hpp
+++ b/archive/include/parameters.hpp
@@ -24,10 +24,12 @@ struct parameters {
input >> num_terms;
input >> max_string_length;
input >> num_completions;
+ input >> universe;
input >> num_levels;
assert(num_terms > 0);
assert(max_string_length > 0);
assert(num_completions > 0);
+ assert(universe >= num_completions);
assert(num_levels > 0);
if (max_string_length > constants::MAX_NUM_CHARS_PER_QUERY) {
@@ -41,14 +43,18 @@ struct parameters {
}
nodes_per_level.resize(num_levels, 0);
- for (uint32_t i = 0; i != num_levels; ++i) {
- input >> nodes_per_level[i];
+ uint32_t i = 0;
+ for (; i != num_levels and input; ++i) input >> nodes_per_level[i];
+ if (i != num_levels) {
+ throw std::runtime_error(
+ "File with statistics may be truncated or malformed");
}
}
uint32_t num_terms;
uint32_t max_string_length;
uint32_t num_completions;
+ uint32_t universe;
uint32_t num_levels;
std::vector nodes_per_level;
std::string collection_basename;
diff --git a/archive/include/probe.hpp b/archive/include/probe.hpp
new file mode 100644
index 0000000..955a939
--- /dev/null
+++ b/archive/include/probe.hpp
@@ -0,0 +1,36 @@
+#pragma once
+
+#include <vector>
+#include "util_types.hpp"
+
+namespace autocomplete {
+
+struct nop_probe {
+ inline void start(uint64_t) {}
+ inline void stop(uint64_t) {}
+};
+
+struct timer_probe {
+ timer_probe(uint64_t n)
+ : m_timers(n) {}
+
+ inline void start(uint64_t i) {
+ assert(i < m_timers.size());
+ m_timers[i].start();
+ }
+
+ inline void stop(uint64_t i) {
+ assert(i < m_timers.size());
+ m_timers[i].stop();
+ }
+
+ timer_type const& get(uint64_t i) {
+ assert(i < m_timers.size());
+ return m_timers[i];
+ }
+
+private:
+ std::vector<timer_type> m_timers;
+};
+
+} // namespace autocomplete
diff --git a/include/scored_string_pool.hpp b/archive/include/scored_string_pool.hpp
similarity index 87%
rename from include/scored_string_pool.hpp
rename to archive/include/scored_string_pool.hpp
index f834453..3f03f06 100644
--- a/include/scored_string_pool.hpp
+++ b/archive/include/scored_string_pool.hpp
@@ -4,6 +4,11 @@
namespace autocomplete {
+struct scored_byte_range {
+ byte_range string;
+ id_type score;
+};
+
struct scored_string_pool {
void init() {
push_back_offset(0);
@@ -39,6 +44,10 @@ struct scored_string_pool {
return m_scores;
}
+ std::vector<id_type> const& const_scores() const {
+ return m_scores;
+ }
+
scored_byte_range operator[](size_t i) const {
assert(i < size());
scored_byte_range sbr;
@@ -69,6 +78,10 @@ struct scored_string_pool {
return m_pool->operator[](m_pos);
}
+ scored_string_pool const* pool() const {
+ return m_pool;
+ }
+
private:
scored_string_pool const* m_pool;
size_t m_pos;
diff --git a/include/statistics.hpp b/archive/include/statistics.hpp
similarity index 81%
rename from include/statistics.hpp
rename to archive/include/statistics.hpp
index a863814..42654ae 100644
--- a/include/statistics.hpp
+++ b/archive/include/statistics.hpp
@@ -10,7 +10,8 @@ namespace autocomplete {
void print(std::string const& what, size_t bytes, size_t total_bytes,
uint64_t num_completions) {
- std::cout << " " << what << ": " << convert(bytes, essentials::MiB)
+ std::cout << " " << what << ": "
+ << essentials::convert(bytes, essentials::MiB)
<< " [MiB]: " << static_cast<double>(bytes) / num_completions
<< " [bytes per completion] ";
std::cout << "(" << (bytes * 100.0) / total_bytes << "%)" << std::endl;
@@ -31,20 +32,21 @@
const {
size_t total_bytes = bytes();
- std::cout << "using " << convert(total_bytes, essentials::MiB) << " [MiB]"
- << std::endl;
+ std::cout << "using " << essentials::convert(total_bytes, essentials::MiB)
+ << " [MiB]" << std::endl;
print_bps("nodes", nodes_bytes(), size());
print_bps("pointers", pointers_bytes(), size());
print_bps("left extremes", left_extremes_bytes(), size());
print_bps("sizes", sizes_bytes(), size());
}
-template <typename Completions, typename Dictionary, typename InvertedIndex,
- typename ForwardIndex>
-void autocomplete<Completions, Dictionary, InvertedIndex, ForwardIndex>::print_stats() const {
+template <typename Completions, typename Dictionary, typename InvertedIndex,
+ typename ForwardIndex>
+void autocomplete<Completions, Dictionary, InvertedIndex,
+ ForwardIndex>::print_stats() const {
size_t total_bytes = bytes();
- std::cout << "using " << convert(total_bytes, essentials::MiB) << " [MiB]: "
+ std::cout << "using " << essentials::convert(total_bytes, essentials::MiB)
+ << " [MiB]: "
<< static_cast<double>(total_bytes) / m_completions.size()
<< " [bytes per completion] " << std::endl;
@@ -74,18 +76,22 @@ void autocomplete
+ << static_cast<double>(m_forward_index.num_integers()) /
+ m_completions.size()
+ << std::endl;
print_bpi("data", m_forward_index.data_bytes(),
m_forward_index.num_integers());
print_bpi("pointers", m_forward_index.pointer_bytes(),
m_forward_index.num_integers());
}
-template <typename Completions, typename Dictionary, typename InvertedIndex>
-void autocomplete2<Completions, Dictionary, InvertedIndex>::print_stats() const {
+template <typename Completions, typename Dictionary, typename InvertedIndex>
+void autocomplete2<Completions, Dictionary, InvertedIndex>::print_stats()
+ const {
size_t total_bytes = bytes();
- std::cout << "using " << convert(total_bytes, essentials::MiB) << " [MiB]: "
+ std::cout << "using " << essentials::convert(total_bytes, essentials::MiB)
+ << " [MiB]: "
<< static_cast<double>(total_bytes) / m_completions.size()
<< " [bytes per completion] " << std::endl;
@@ -115,12 +121,12 @@ void autocomplete2
-template <typename Completions, typename Dictionary, typename InvertedIndex>
-void autocomplete3<Completions, Dictionary, InvertedIndex>::print_stats() const {
+template <typename Completions, typename Dictionary, typename InvertedIndex>
+void autocomplete3<Completions, Dictionary, InvertedIndex>::print_stats()
+ const {
size_t total_bytes = bytes();
- std::cout << "using " << convert(total_bytes, essentials::MiB) << " [MiB]: "
+ std::cout << "using " << essentials::convert(total_bytes, essentials::MiB)
+ << " [MiB]: "
<< static_cast<double>(total_bytes) / m_completions.size()
<< " [bytes per completion] " << std::endl;
@@ -140,12 +146,13 @@ void autocomplete3
-template <typename Completions, typename Dictionary, typename BlockedInvertedIndex>
-void autocomplete4<Completions, Dictionary, BlockedInvertedIndex>::print_stats() const {
+template <typename Completions, typename Dictionary, typename BlockedInvertedIndex>
+void autocomplete4<Completions, Dictionary, BlockedInvertedIndex>::print_stats()
+ const {
size_t total_bytes = bytes();
- std::cout << "using " << convert(total_bytes, essentials::MiB) << " [MiB]: "
+ std::cout << "using " << essentials::convert(total_bytes, essentials::MiB)
+ << " [MiB]: "
<< static_cast<double>(total_bytes) / m_completions.size()
<< " [bytes per completion] " << std::endl;
diff --git a/include/succinct_rmq/README.md b/archive/include/succinct_rmq/README.md
similarity index 100%
rename from include/succinct_rmq/README.md
rename to archive/include/succinct_rmq/README.md
diff --git a/include/succinct_rmq/bp_vector.hpp b/archive/include/succinct_rmq/bp_vector.hpp
similarity index 100%
rename from include/succinct_rmq/bp_vector.hpp
rename to archive/include/succinct_rmq/bp_vector.hpp
diff --git a/include/succinct_rmq/bp_vector_support.hpp b/archive/include/succinct_rmq/bp_vector_support.hpp
similarity index 100%
rename from include/succinct_rmq/bp_vector_support.hpp
rename to archive/include/succinct_rmq/bp_vector_support.hpp
diff --git a/include/succinct_rmq/cartesian_tree.hpp b/archive/include/succinct_rmq/cartesian_tree.hpp
similarity index 100%
rename from include/succinct_rmq/cartesian_tree.hpp
rename to archive/include/succinct_rmq/cartesian_tree.hpp
diff --git a/include/succinct_rmq/rs_bit_vector.hpp b/archive/include/succinct_rmq/rs_bit_vector.hpp
similarity index 100%
rename from include/succinct_rmq/rs_bit_vector.hpp
rename to archive/include/succinct_rmq/rs_bit_vector.hpp
diff --git a/archive/include/types.hpp b/archive/include/types.hpp
new file mode 100644
index 0000000..659199d
--- /dev/null
+++ b/archive/include/types.hpp
@@ -0,0 +1,47 @@
+#pragma once
+
+#include "completion_trie.hpp"
+#include "fc_dictionary.hpp"
+#include "integer_fc_dictionary.hpp"
+#include "compact_forward_index.hpp"
+#include "inverted_index.hpp"
+#include "blocked_inverted_index.hpp"
+#include "autocomplete.hpp"
+#include "autocomplete2.hpp"
+#include "autocomplete3.hpp"
+#include "autocomplete4.hpp"
+#include "compact_vector.hpp"
+#include "ef/ef_sequence.hpp"
+#include "ef/compact_ef.hpp"
+
+namespace autocomplete {
+
+typedef uint_vec<uint32_t> uint32_vec;
+typedef uint_vec<uint64_t> uint64_vec;
+
+typedef completion_trie<ef::ef_sequence, ef::ef_sequence, ef::ef_sequence,
+ ef::ef_sequence>
+ ef_completion_trie;
+typedef fc_dictionary<> fc_dictionary_type;
+typedef integer_fc_dictionary<> integer_fc_dictionary_type;
+typedef inverted_index<ef::compact_ef> ef_inverted_index;
+typedef blocked_inverted_index<ef::compact_ef> ef_blocked_inverted_index;
+
+/* compressed indexes */
+typedef autocomplete<ef_completion_trie, fc_dictionary_type,
+ ef_inverted_index, compact_forward_index>
+ ef_autocomplete_type1;
+
+typedef autocomplete2<integer_fc_dictionary_type, fc_dictionary_type,
+ ef_inverted_index>
+ ef_autocomplete_type2;
+
+typedef autocomplete3<integer_fc_dictionary_type, fc_dictionary_type,
+ ef_inverted_index>
+ ef_autocomplete_type3;
+
+typedef autocomplete4<integer_fc_dictionary_type, fc_dictionary_type,
+ ef_blocked_inverted_index>
+ ef_autocomplete_type4;
+
+} // namespace autocomplete
\ No newline at end of file
diff --git a/include/uint_vec.hpp b/archive/include/uint_vec.hpp
similarity index 94%
rename from include/uint_vec.hpp
rename to archive/include/uint_vec.hpp
index 86d60c4..adeaa8c 100644
--- a/include/uint_vec.hpp
+++ b/archive/include/uint_vec.hpp
@@ -74,14 +74,14 @@ struct uint_vec {
}
uint64_t find(const range r, UintType id) const {
- assert(!r.is_invalid());
+ assert(r.is_valid());
assert(r.end <= size());
- UintType prev_upper = previous_range_upperbound(r);
+ auto prev_upper = previous_range_upperbound(r);
return util::find(*this, id + prev_upper, r.begin, r.end - 1);
}
range find(const range r, const range lex) const {
- assert(!r.is_invalid());
+ assert(r.is_valid());
assert(r.end <= size());
auto prev_upper = previous_range_upperbound(r);
@@ -131,9 +131,9 @@ struct uint_vec {
std::vector<UintType> m_data;
UintType previous_range_upperbound(const range r) const {
- assert(!r.is_invalid());
+ assert(r.is_valid());
return r.begin ? access(r.begin - 1) : 0;
}
-}; // namespace autocomplete
+};
} // namespace autocomplete
\ No newline at end of file
diff --git a/include/uncompressed_list.hpp b/archive/include/uncompressed_list.hpp
similarity index 100%
rename from include/uncompressed_list.hpp
rename to archive/include/uncompressed_list.hpp
diff --git a/include/unsorted_list.hpp b/archive/include/unsorted_list.hpp
similarity index 78%
rename from include/unsorted_list.hpp
rename to archive/include/unsorted_list.hpp
index e7cfddd..bb06a86 100644
--- a/include/unsorted_list.hpp
+++ b/archive/include/unsorted_list.hpp
@@ -1,48 +1,10 @@
#pragma once
#include "compact_vector.hpp"
+#include "util_types.hpp"
namespace autocomplete {
-struct scored_byte_range {
- byte_range string;
- id_type score;
-};
-
-typedef std::function<bool(scored_range const&, scored_range const&)>
- scored_range_comparator_type;
-scored_range_comparator_type scored_range_comparator =
- [](scored_range const& l, scored_range const& r) {
- return l.min_val > r.min_val;
- };
-
-struct topk_queue {
- void push(scored_range sr) {
- m_q.push_back(sr);
- std::push_heap(m_q.begin(), m_q.end(), scored_range_comparator);
- }
-
- scored_range top() {
- return m_q.front();
- }
-
- void pop() {
- std::pop_heap(m_q.begin(), m_q.end(), scored_range_comparator);
- m_q.pop_back();
- }
-
- void clear() {
- m_q.clear();
- }
-
- bool empty() const {
- return m_q.empty();
- }
-
-private:
-    std::vector<scored_range> m_q;
-};
-
template <typename RMQ>
struct unsorted_list {
static const uint32_t SCAN_THRESHOLD = 64;
@@ -132,6 +94,40 @@ struct unsorted_list {
}
private:
+ struct topk_queue {
+ void push(scored_range sr) {
+ m_q.push_back(sr);
+ std::push_heap(m_q.begin(), m_q.end(), m_comparator);
+ }
+
+ scored_range top() {
+ return m_q.front();
+ }
+
+ void pop() {
+ std::pop_heap(m_q.begin(), m_q.end(), m_comparator);
+ m_q.pop_back();
+ }
+
+ void clear() {
+ m_q.clear();
+ }
+
+ bool empty() const {
+ return m_q.empty();
+ }
+
+ private:
+        std::vector<scored_range> m_q;
+
+        typedef std::function<bool(scored_range const&, scored_range const&)>
+            scored_range_comparator_type;
+        scored_range_comparator_type m_comparator = [](scored_range const& l,
+                                                       scored_range const& r) {
+            return scored_range::greater(l, r);
+        };
+ };
+
topk_queue m_q;
RMQ m_rmq;
compact_vector m_list;
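
Moving `topk_queue` inside `unsorted_list` also replaces the comparator that used to be a non-inline `std::function` defined at namespace scope in a header (an ODR liability once two translation units include it) with a per-instance member. The queue itself is an ordinary binary heap: `std::push_heap`/`std::pop_heap` with a greater-than comparator keep the smallest `min_val` at the front. A self-contained sketch of that pattern, with made-up scores:

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

struct scored {
    uint32_t min_val;  // smaller value = better candidate
};

int main() {
    auto greater = [](scored const& l, scored const& r) {
        return l.min_val > r.min_val;
    };
    std::vector<scored> q;
    for (uint32_t v : {42u, 7u, 19u}) {
        q.push_back({v});
        std::push_heap(q.begin(), q.end(), greater);  // keep min at the front
    }
    while (!q.empty()) {
        std::cout << q.front().min_val << '\n';  // prints 7, 19, 42
        std::pop_heap(q.begin(), q.end(), greater);
        q.pop_back();
    }
    return 0;
}
```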
diff --git a/include/util.hpp b/archive/include/util.hpp
similarity index 99%
rename from include/util.hpp
rename to archive/include/util.hpp
index bb20bdb..4f0b89e 100644
--- a/include/util.hpp
+++ b/archive/include/util.hpp
@@ -51,6 +51,7 @@ uint64_t find(S const& sequence, uint64_t id, uint64_t lo, uint64_t hi) {
if (val == id) {
return pos;
} else if (val > id) {
+ if (pos == 0) return global::not_found;
hi = pos - 1;
} else {
lo = pos + 1;
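
The single added line in `util::find` fixes a real corner case: all bounds are unsigned, so when the probed value exceeds `id` at `pos == 0`, the old `hi = pos - 1` wrapped around to `2^64 - 1` and the search walked out of bounds. A minimal sketch of the repaired search over a plain sorted vector (the original is templated over the sequence type; `not_found` stands in for `global::not_found`):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

static const uint64_t not_found = uint64_t(-1);

// Binary search over sequence[lo..hi]; all indices are unsigned, so the
// early return prevents hi = pos - 1 from wrapping when pos == 0.
uint64_t find(std::vector<uint64_t> const& sequence, uint64_t id, uint64_t lo,
              uint64_t hi) {
    while (lo <= hi) {
        uint64_t pos = lo + (hi - lo) / 2;
        uint64_t val = sequence[pos];
        if (val == id) {
            return pos;
        } else if (val > id) {
            if (pos == 0) return not_found;  // the guard added by this patch
            hi = pos - 1;
        } else {
            lo = pos + 1;
        }
    }
    return not_found;
}

int main() {
    std::vector<uint64_t> s = {3, 7, 9};
    assert(find(s, 7, 0, 2) == 1);
    assert(find(s, 1, 0, 2) == not_found);  // wrapped out of bounds before
    return 0;
}
```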
diff --git a/include/util_types.hpp b/archive/include/util_types.hpp
similarity index 81%
rename from include/util_types.hpp
rename to archive/include/util_types.hpp
index 7405378..0890002 100644
--- a/include/util_types.hpp
+++ b/archive/include/util_types.hpp
@@ -36,6 +36,7 @@ struct range {
uint64_t begin;
uint64_t end;
bool is_invalid() const;
+ bool is_valid() const;
bool contains(uint64_t val) const;
};
@@ -48,6 +49,10 @@ bool range::is_invalid() const {
end == global::invalid_range.end or begin > end;
}
+bool range::is_valid() const {
+ return !is_invalid();
+}
+
bool range::contains(uint64_t val) const {
if (val >= begin and val <= end) return true;
return false;
@@ -57,6 +62,55 @@ struct scored_range {
range r;
uint32_t min_pos;
id_type min_val;
+
+ static bool greater(scored_range const& l, scored_range const& r) {
+ return l.min_val > r.min_val;
+ }
+};
+
+template <typename Iterator>
+struct scored_range_with_list_iterator {
+ typedef Iterator iterator_type;
+
+ scored_range_with_list_iterator()
+ : min_pos(global::invalid_term_id)
+ , m_open(false) {}
+
+ range r;
+ uint32_t min_pos;
+ id_type min_val;
+ Iterator iterator;
+
+ bool is_open() const {
+ return m_open;
+ }
+
+    template <typename InvertedIndex>
+ void set_iterator(InvertedIndex const& index) {
+ assert(min_pos != global::invalid_term_id);
+ m_open = true;
+ iterator = index.iterator(min_pos);
+ }
+
+ id_type minimum() const {
+ return is_open() ? *iterator : min_val;
+ }
+
+ // static bool greater(scored_range_with_list_iterator const& l,
+ // scored_range_with_list_iterator const& r) {
+ // return l.minimum() > r.minimum();
+ // }
+
+private:
+ bool m_open;
+};
+
+template <typename Iterator>
+struct scored_range_with_list_iterator_comparator {
+    bool operator()(scored_range_with_list_iterator<Iterator> const& l,
+                    scored_range_with_list_iterator<Iterator> const& r) {
+ return l.minimum() > r.minimum();
+ }
};
struct byte_range {
@@ -237,25 +291,4 @@ struct timer {
typedef timer timer_type;
-struct iterator {
- iterator(id_type begin, id_type end)
- : m_begin(begin)
- , m_end(end) {}
-
- bool has_next() const {
- return m_begin < m_end;
- }
-
- id_type operator*() const {
- return m_begin;
- }
-
- void operator++() {
- ++m_begin;
- }
-
-private:
- id_type m_begin, m_end;
-};
-
} // namespace autocomplete
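
`scored_range_with_list_iterator` defers opening a posting-list iterator until a range is actually expanded: `minimum()` serves the precomputed `min_val` until `set_iterator()` is called, then the iterator's current value. The companion comparator struct makes that ordering usable directly with the standard heap algorithms. A self-contained sketch of the same lazy-minimum pattern (`toy_iterator` and all names here are illustrative stand-ins):

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Toy stand-in for a posting-list iterator that yields increasing doc ids.
struct toy_iterator {
    uint64_t current = 0;
    uint64_t operator*() const { return current; }
};

struct lazy_range {
    explicit lazy_range(uint64_t v) : min_val(v), open(false) {}
    uint64_t min_val;  // precomputed lower bound for the range
    bool open;         // becomes true once the real iterator is materialized
    toy_iterator iterator;
    uint64_t minimum() const { return open ? *iterator : min_val; }
};

// Function-object comparator, usable with std::make_heap and friends.
struct lazy_range_comparator {
    bool operator()(lazy_range const& l, lazy_range const& r) const {
        return l.minimum() > r.minimum();  // '>' yields a min-heap
    }
};

int main() {
    std::vector<lazy_range> heap = {lazy_range(5), lazy_range(2), lazy_range(9)};
    std::make_heap(heap.begin(), heap.end(), lazy_range_comparator{});
    // heap.front().minimum() == 2: ranges are ordered by their cheapest
    // known bound, and no iterator had to be opened to decide that.
    return 0;
}
```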
diff --git a/archive/install.sh b/archive/install.sh
new file mode 100644
index 0000000..7714147
--- /dev/null
+++ b/archive/install.sh
@@ -0,0 +1,11 @@
+git submodule init
+git submodule update
+mkdir -p build
+cd build
+cmake .. -DCMAKE_BUILD_TYPE=Release -DUSE_SANITIZERS=Off -DUSE_INTRINSICS=On -DUSE_PDEP=On
+make
+cd ../test_data
+bash preprocess.sh trec_05_efficiency_queries/trec_05_efficiency_queries.completions 300
+cd ../build
+make test
+cd ..
diff --git a/archive/script/benchmark_dictionaries.sh b/archive/script/benchmark_dictionaries.sh
new file mode 100644
index 0000000..29c9a84
--- /dev/null
+++ b/archive/script/benchmark_dictionaries.sh
@@ -0,0 +1,7 @@
+cd ../test_data
+bash preprocess.sh aol/aol.completions 100000
+cd ../build
+python ../script/collect_locate_prefix_results_by_varying_percentage.py fc ../test_data/aol/aol.completions 100000
+python ../script/collect_locate_prefix_results_by_varying_percentage.py trie ../test_data/aol/aol.completions 100000
+./benchmark_fc_dictionary ../test_data/aol/aol.completions 100000 < ../test_data/aol/aol.completions.queries/queries.length=1 > ../test_data/aol/aol.completions.dictionary_benchmark.txt
+cd ../script
\ No newline at end of file
diff --git a/archive/script/build_indexes.py b/archive/script/build_indexes.py
new file mode 100644
index 0000000..e01e1db
--- /dev/null
+++ b/archive/script/build_indexes.py
@@ -0,0 +1,6 @@
+import sys, os
+
+dataset_name = sys.argv[1] # e.g., aol
+types = ["ef_type1", "ef_type2", "ef_type3", "ef_type4"]
+for t in types:
+ os.system("./build " + t + " ../test_data/" + dataset_name + "/" + dataset_name + ".completions -o " + t + "." + dataset_name + ".bin -c 0.0001")
\ No newline at end of file
diff --git a/archive/script/collect_effectiveness_results_by_varying_percentage.py b/archive/script/collect_effectiveness_results_by_varying_percentage.py
new file mode 100644
index 0000000..2693e70
--- /dev/null
+++ b/archive/script/collect_effectiveness_results_by_varying_percentage.py
@@ -0,0 +1,17 @@
+import sys, os
+
+index_type = sys.argv[1]
+index_filename = sys.argv[2]
+collection_basename = sys.argv[3] # e.g., aol/aol.completions or aol/aol.completions.filtered
+k = sys.argv[4]
+num_queries = sys.argv[5]
+
+output_filename = collection_basename + "." + index_type
+output_filename += ".effectiveness.json"
+query_filename_prefix = collection_basename + ".queries/queries."
+
+percentages = ["0.0", "0.25", "0.50", "0.75"]
+for perc in percentages:
+ for terms in range(1,7):
+ os.system("../build/effectiveness " + index_type + " " + k + " ../build/" + index_filename + " " + str(terms) + " " + str(num_queries) + " " + perc + " < " + query_filename_prefix + "length=" + str(terms) + " 2>> " + output_filename)
+ os.system("../build/effectiveness " + index_type + " " + k + " ../build/" + index_filename + " 7+ " + str(num_queries) + " " + perc + " < " + query_filename_prefix + "length=7+ 2>> " + output_filename)
diff --git a/archive/script/collect_locate_prefix_results_by_varying_percentage.py b/archive/script/collect_locate_prefix_results_by_varying_percentage.py
new file mode 100644
index 0000000..305fafa
--- /dev/null
+++ b/archive/script/collect_locate_prefix_results_by_varying_percentage.py
@@ -0,0 +1,14 @@
+import sys, os
+
+type = sys.argv[1] # 'trie' or 'fc'
+collection_basename = sys.argv[2]
+num_queries = sys.argv[3]
+
+output_filename = collection_basename + "." + type + ".locate_prefix.json"
+query_filename_prefix = collection_basename + ".queries/queries."
+
+percentages = ["0.0", "0.25", "0.50", "0.75"]
+for perc in percentages:
+ for terms in range(1,8):
+ os.system("../build/benchmark_locate_prefix " + type + " " + collection_basename + " " + str(terms) + " " + str(num_queries) + " " + perc + " < " + query_filename_prefix + "length=" + str(terms) + " 2>> " + output_filename)
+ os.system("../build/benchmark_locate_prefix " + type + " " + collection_basename + " 8+ " + str(num_queries) + " " + perc + " < " + query_filename_prefix + "length=8+ 2>> " + output_filename)
diff --git a/archive/script/collect_results_by_varying_percentage.py b/archive/script/collect_results_by_varying_percentage.py
new file mode 100644
index 0000000..c639032
--- /dev/null
+++ b/archive/script/collect_results_by_varying_percentage.py
@@ -0,0 +1,18 @@
+import sys, os
+
+index_type = sys.argv[1]
+query_mode = sys.argv[2] # topk, prefix_topk, conjunctive_topk
+index_filename = sys.argv[3]
+collection_basename = sys.argv[4] # e.g., aol/aol.completions or aol/aol.completions.filtered
+k = sys.argv[5]
+num_queries = sys.argv[6]
+
+output_filename = collection_basename + "." + index_type
+output_filename += "." + query_mode + ".json"
+query_filename_prefix = collection_basename + ".queries/queries."
+
+percentages = ["0.0", "0.25", "0.50", "0.75"]
+for perc in percentages:
+ for terms in range(1,7):
+ os.system("../build/benchmark_" + query_mode + " " + index_type + " " + k + " ../build/" + index_filename + " " + str(terms) + " " + str(num_queries) + " " + perc + " < " + query_filename_prefix + "length=" + str(terms) + " 2>> " + output_filename)
+ os.system("../build/benchmark_" + query_mode + " " + index_type + " " + k + " ../build/" + index_filename + " 7+ " + str(num_queries) + " " + perc + " < " + query_filename_prefix + "length=7+ 2>> " + output_filename)
diff --git a/src/CMakeLists.txt b/archive/src/CMakeLists.txt
similarity index 70%
rename from src/CMakeLists.txt
rename to archive/src/CMakeLists.txt
index 7b000b1..1c5a82d 100644
--- a/src/CMakeLists.txt
+++ b/archive/src/CMakeLists.txt
@@ -2,3 +2,5 @@ add_executable(build build.cpp)
add_executable(web_server web_server.cpp ../external/mongoose/mongoose.c)
add_executable(output_ds2i_format output_ds2i_format.cpp)
add_executable(statistics statistics.cpp)
+# add_executable(check_topk check_topk.cpp)
+add_executable(map_queries map_queries.cpp)
\ No newline at end of file
diff --git a/src/build.cpp b/archive/src/build.cpp
similarity index 52%
rename from src/build.cpp
rename to archive/src/build.cpp
index 732318f..ba73954 100644
--- a/src/build.cpp
+++ b/archive/src/build.cpp
@@ -2,57 +2,48 @@
#include "types.hpp"
#include "statistics.hpp"
+#include "../external/cmd_line_parser/include/parser.hpp"
using namespace autocomplete;
template <typename Index>
-void build(parameters const& params, char const* output_filename) {
+void build(parameters const& params, std::string const& output_filename) {
Index index(params);
index.print_stats();
- if (output_filename) {
+ if (output_filename != "") {
essentials::logger("saving data structure to disk...");
- essentials::save(index, output_filename);
+ essentials::save(index, output_filename.c_str());
essentials::logger("DONE");
}
}
void build_type4(parameters const& params, const float c,
- char const* output_filename) {
+ std::string const& output_filename) {
ef_autocomplete_type4 index(params, c);
index.print_stats();
- if (output_filename) {
+ if (output_filename != "") {
essentials::logger("saving data structure to disk...");
- essentials::save(index, output_filename);
+ essentials::save(index, output_filename.c_str());
essentials::logger("DONE");
}
}
int main(int argc, char** argv) {
- int mandatory = 2;
- if (argc < mandatory + 1) {
- std::cout << argv[0]
- << " [-o output_filename] [-c c]"
- << std::endl;
- return 1;
- }
-
- std::string type(argv[1]);
+ cmd_line_parser::parser parser(argc, argv);
+ parser.add("type", "Index type.");
+ parser.add("collection_basename", "Collection basename.");
+ parser.add("output_filename", "Output filename.", "-o", false);
+ parser.add(
+ "c",
+ "Value for Bast and Weber's technique: c must be a float in (0,1].",
+ "-c", false);
+ if (!parser.parse()) return 1;
+
+    auto type = parser.get<std::string>("type");
parameters params;
- params.collection_basename = argv[2];
+    params.collection_basename = parser.get<std::string>("collection_basename");
params.load();
-
- char const* output_filename = nullptr;
- float c = 0.0;
-
- for (int i = mandatory; i != argc; ++i) {
- if (std::string(argv[i]) == "-o") {
- ++i;
- output_filename = argv[i];
- } else if (std::string(argv[i]) == "-c") {
- ++i;
- c = std::stof(argv[i]);
- }
- }
+    auto output_filename = parser.get<std::string>("output_filename");
if (type == "ef_type1") {
        build<ef_autocomplete_type1>(params, output_filename);
@@ -61,10 +52,7 @@ int main(int argc, char** argv) {
} else if (type == "ef_type3") {
        build<ef_autocomplete_type3>(params, output_filename);
} else if (type == "ef_type4") {
- if (c == 0.0) {
- std::cerr << "c must be greater than 0.0" << std::endl;
- return 1;
- }
+        auto c = parser.get<float>("c");
build_type4(params, c, output_filename);
} else {
return 1;
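
The rewritten `main` swaps hand-rolled `argv` scanning for jermp's cmd_line_parser: required arguments are positional, optional ones are registered with a shorthand flag and `false`, and values come back through the typed `get<T>()`. A minimal sketch repeating only calls the patch itself makes (the behavior of `get` on an absent optional, which `build.cpp` checks via `!= ""`, is the library's, not specified here):

```cpp
#include <iostream>
#include <string>

#include "../external/cmd_line_parser/include/parser.hpp"

int main(int argc, char** argv) {
    cmd_line_parser::parser parser(argc, argv);
    parser.add("type", "Index type.");  // required, positional
    parser.add("output_filename", "Output filename.", "-o",
               false);                  // optional, introduced by a flag
    if (!parser.parse()) return 1;      // prints usage on failure

    auto type = parser.get<std::string>("type");
    auto output_filename = parser.get<std::string>("output_filename");
    std::cout << type << " -> " << output_filename << std::endl;
    return 0;
}
```

Invocation then looks like the command lines in `script/build_indexes.py`, e.g. `./build ef_type1 ../test_data/aol/aol.completions -o ef_type1.aol.bin -c 0.0001`.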
diff --git a/archive/src/check_topk.cpp b/archive/src/check_topk.cpp
new file mode 100644
index 0000000..cb466a1
--- /dev/null
+++ b/archive/src/check_topk.cpp
@@ -0,0 +1,64 @@
+#include <iostream>
+
+#include "types.hpp"
+#include "../benchmark/benchmark_common.hpp"
+
+using namespace autocomplete;
+
+template <typename Index>
+void check_topk(char const* binary_filename1, char const* binary_filename2,
+ uint32_t k, uint32_t max_num_queries, float keep) {
+ Index index1;
+ ef_autocomplete_type1 index2;
+ essentials::load(index1, binary_filename1);
+ essentials::load(index2, binary_filename2);
+    std::vector<std::string> queries;
+ load_queries(queries, max_num_queries, keep, std::cin);
+ for (auto const& query : queries) {
+ size_t n1 = index1.topk(query, k).size();
+ size_t n2 = index2.topk(query, k).size();
+ if (n1 != n2) {
+ std::cout << query << std::endl;
+ }
+ }
+}
+
+int main(int argc, char** argv) {
+ int mandatory = 6;
+ if (argc < mandatory + 1) {
+        std::cout << argv[0]
+                  << " <type> <k> <binary_filename1> <binary_filename2>"
+                     " <max_num_queries> <keep>"
+                     " < queries"
+                  << std::endl;
+        std::cout << "<keep> is a float in [0,1] and specifies how much "
+                     "we keep of the last token in a query "
+                  << std::endl;
+ return 1;
+ }
+
+ std::string type(argv[1]);
+ uint32_t k = std::atoi(argv[2]);
+ char const* binary_filename1 = argv[3];
+ char const* binary_filename2 = argv[4];
+ uint32_t max_num_queries = std::atoi(argv[5]);
+ float keep = std::atof(argv[6]);
+
+ if (type == "ef_type1") {
+ check_topk(binary_filename1, binary_filename2, k,
+ max_num_queries, keep);
+ } else if (type == "ef_type2") {
+ check_topk(binary_filename1, binary_filename2, k,
+ max_num_queries, keep);
+ } else if (type == "ef_type3") {
+ check_topk(binary_filename1, binary_filename2, k,
+ max_num_queries, keep);
+ } else if (type == "ef_type4") {
+ check_topk(binary_filename1, binary_filename2, k,
+ max_num_queries, keep);
+ } else {
+ return 1;
+ }
+
+ return 0;
+}
\ No newline at end of file
diff --git a/archive/src/map_queries.cpp b/archive/src/map_queries.cpp
new file mode 100644
index 0000000..de43df1
--- /dev/null
+++ b/archive/src/map_queries.cpp
@@ -0,0 +1,53 @@
+#include <iostream>
+
+#include "types.hpp"
+
+using namespace autocomplete;
+
+template <typename Dictionary>
+completion_type parse(Dictionary const& dict, std::string const& query) {
+ completion_type completion;
+ byte_range_iterator it(string_to_byte_range(query));
+ while (it.has_next()) {
+ byte_range term = it.next();
+ auto term_id = dict.locate(term);
+ assert(term_id > 0);
+ assert(term_id != global::invalid_term_id);
+ completion.push_back(term_id - 1);
+ }
+ return completion;
+}
+
+int main(int argc, char** argv) {
+ int mandatory = 2 + 1;
+ if (argc < mandatory) {
+        std::cout << argv[0] << " <collection_basename> <num_queries> < queries"
+                  << std::endl;
+ return 1;
+ }
+
+ parameters params;
+ params.collection_basename = argv[1];
+ params.load();
+
+ uint32_t num_queries = std::atoi(argv[2]);
+
+ fc_dictionary_type dict;
+ {
+ fc_dictionary_type::builder builder(params);
+ builder.build(dict);
+ }
+
+ std::string query;
+ for (uint32_t i = 0; i != num_queries; ++i) {
+ if (!std::getline(std::cin, query)) break;
+ auto completion = parse(dict, query);
+ std::cerr << completion.front();
+ for (size_t i = 1; i != completion.size(); ++i) {
+ std::cerr << "\t" << completion[i];
+ }
+ std::cerr << "\n";
+ }
+
+ return 0;
+}
\ No newline at end of file
diff --git a/src/output_ds2i_format.cpp b/archive/src/output_ds2i_format.cpp
similarity index 97%
rename from src/output_ds2i_format.cpp
rename to archive/src/output_ds2i_format.cpp
index cc139c4..eb92509 100644
--- a/src/output_ds2i_format.cpp
+++ b/archive/src/output_ds2i_format.cpp
@@ -27,7 +27,7 @@ int main(int argc, char** argv) {
{ // write ds2i header
uint32_t n = 1;
- uint32_t universe = params.num_completions;
+ uint32_t universe = params.universe;
        docs.write(reinterpret_cast<char const*>(&n), sizeof(uint32_t));
        docs.write(reinterpret_cast<char const*>(&universe), sizeof(uint32_t));
}
diff --git a/src/statistics.cpp b/archive/src/statistics.cpp
similarity index 58%
rename from src/statistics.cpp
rename to archive/src/statistics.cpp
index 5b2148f..9dbf689 100644
--- a/src/statistics.cpp
+++ b/archive/src/statistics.cpp
@@ -2,25 +2,25 @@
#include "types.hpp"
#include "statistics.hpp"
+#include "../external/cmd_line_parser/include/parser.hpp"
using namespace autocomplete;
template
-void print_stats(char const* index_filename) {
+void print_stats(std::string const& index_filename) {
Index index;
- essentials::load(index, index_filename);
+ essentials::load(index, index_filename.c_str());
index.print_stats();
}
int main(int argc, char** argv) {
- int mandatory = 2;
- if (argc < mandatory + 1) {
-        std::cout << argv[0] << " <type> <index_filename>" << std::endl;
- return 1;
- }
+ cmd_line_parser::parser parser(argc, argv);
+ parser.add("type", "Index type.");
+ parser.add("index_filename", "Index filename.");
+ if (!parser.parse()) return 1;
- std::string type(argv[1]);
- char const* index_filename = argv[2];
+    auto type = parser.get<std::string>("type");
+    auto index_filename = parser.get<std::string>("index_filename");
if (type == "ef_type1") {
        print_stats<ef_autocomplete_type1>(index_filename);
diff --git a/src/web_server.cpp b/archive/src/web_server.cpp
similarity index 92%
rename from src/web_server.cpp
rename to archive/src/web_server.cpp
index 94a259b..db317fa 100644
--- a/src/web_server.cpp
+++ b/archive/src/web_server.cpp
@@ -5,6 +5,7 @@
#include "constants.hpp"
#include "types.hpp"
+#include "probe.hpp"
#include "../external/mongoose/mongoose.h"
@@ -26,7 +27,7 @@ std::string escape_json(std::string const& s) {
using namespace autocomplete;
-typedef ef_autocomplete_type3 topk_index_type;
+typedef ef_autocomplete_type1 topk_index_type;
static std::string s_http_port("8000");
static struct mg_serve_http_opts s_http_server_opts;
@@ -53,9 +54,10 @@ static void ev_handler(struct mg_connection* nc, int ev, void* p) {
}
std::string data;
- auto it = topk_index.topk(query, k);
- // auto it = topk_index.prefix_topk(query, k);
- // auto it = topk_index.conjunctive_topk(query, k);
+ nop_probe probe;
+    // auto it = topk_index.topk(query, k, probe);
+ // auto it = topk_index.prefix_topk(query, k, probe);
+ auto it = topk_index.conjunctive_topk(query, k, probe);
if (it.empty()) {
data = "{\"suggestions\":[\"value\":\"\",\"data\":\"\"]}\n";
} else {
diff --git a/archive/test/test_autocomplete.cpp b/archive/test/test_autocomplete.cpp
new file mode 100644
index 0000000..8fe49cc
--- /dev/null
+++ b/archive/test/test_autocomplete.cpp
@@ -0,0 +1,83 @@
+#include "test_common.hpp"
+
+using namespace autocomplete;
+
+typedef ef_autocomplete_type1 index_type;
+
+TEST_CASE("test autocomplete topk functions") {
+ char const* output_filename = testing::tmp_filename.c_str();
+ parameters params;
+ params.collection_basename = testing::test_filename.c_str();
+ params.load();
+
+ {
+ index_type index(params);
+ essentials::save(index, output_filename);
+ }
+
+ {
+ index_type index;
+ essentials::load(index, output_filename);
+
+ {
+ essentials::logger("testing prefix_topk()...");
+ uint32_t k = 7;
+            std::vector<std::string> queries = {
+ "a", "10", "african",
+ "air", "commercial", "internet",
+ "paris", "somerset", "the",
+ "the new", "the perfect", "the starting line",
+ "yu gi oh", "for sale", "dave mat",
+ "florence", "florida be", "for s",
+ "for sa", "for sal", "for sale",
+ "ford a", "ford au", "ford m",
+ "ford mu", "for", "fo",
+ "f", "matt", "fl",
+ "florir", "fly", "the starting l",
+ "floridaaa"};
+
+ nop_probe probe;
+ for (auto& query : queries) {
+ auto it = index.prefix_topk(query, k, probe);
+ std::cout << "top-" << it.size() << " completions for '"
+ << query << "':\n";
+ for (uint32_t i = 0; i != it.size(); ++i, ++it) {
+ auto completion = *it;
+ std::cout << "(" << completion.score << ", '";
+ print(completion.string);
+ std::cout << "')" << std::endl;
+ }
+ }
+
+ essentials::logger("DONE");
+ }
+
+ {
+ essentials::logger("testing conjunctive_topk()...");
+ uint32_t k = 7;
+            std::vector<std::string> queries = {
+ "dave mat", "florence", "florida be", "for s",
+ "for sa", "for sal", "for sale", "ford a",
+ "ford au", "ford m", "ford mu", "for",
+ "fo", "f", "matt", "fl",
+ "flor", "fly", "the starting l"};
+
+ nop_probe probe;
+ for (auto& query : queries) {
+ auto it = index.conjunctive_topk(query, k, probe);
+ std::cout << "top-" << it.size() << " completions for '"
+ << query << "':\n";
+ for (uint32_t i = 0; i != it.size(); ++i, ++it) {
+ auto completion = *it;
+ std::cout << "(" << completion.score << ", '";
+ print(completion.string);
+ std::cout << "')" << std::endl;
+ }
+ }
+
+ essentials::logger("DONE");
+ }
+ }
+
+ std::remove(output_filename);
+}
diff --git a/archive/test/test_blocked_inverted_index.cpp b/archive/test/test_blocked_inverted_index.cpp
new file mode 100644
index 0000000..a2ede74
--- /dev/null
+++ b/archive/test/test_blocked_inverted_index.cpp
@@ -0,0 +1,63 @@
+#include "test_common.hpp"
+
+using namespace autocomplete;
+
+typedef ef_blocked_inverted_index blocked_inverted_index_type;
+typedef ef_inverted_index inverted_index_type;
+
+TEST_CASE("test blocked_inverted_index::intersection_iterator") {
+ parameters params;
+ params.collection_basename = testing::test_filename.c_str();
+ params.load();
+
+ inverted_index_type ii;
+
+ {
+ inverted_index_type::builder ii_builder(params);
+ ii_builder.build(ii);
+ REQUIRE(ii.num_docs() == params.universe);
+ REQUIRE(ii.num_terms() == params.num_terms);
+ }
+
+ {
+ static const uint32_t num_queries = 10000;
+ static const uint32_t max_num_terms = 3;
+ auto queries = testing::gen_random_queries(num_queries, max_num_terms,
+ params.num_terms);
+
+        static const std::vector<float> C = {0.0125, 0.025, 0.05, 0.1};
+ blocked_inverted_index_type blocked_ii;
+ uint64_t total;
+
+ for (auto c : C) {
+ total = 0;
+ {
+ blocked_inverted_index_type::builder blocked_ii_builder(params,
+ c);
+ blocked_ii_builder.build(blocked_ii);
+ }
+
+ REQUIRE(blocked_ii.num_docs() == params.universe);
+ REQUIRE(blocked_ii.num_terms() == params.num_terms);
+
+ for (auto& q : queries) {
+ auto ii_it = ii.intersection_iterator(q);
+ auto blocked_ii_it =
+ blocked_ii.intersection_iterator(q, {0, 0});
+
+ uint32_t n = 0;
+ for (; ii_it.has_next(); ++n, ++ii_it, ++blocked_ii_it) {
+ auto got = *blocked_ii_it;
+ auto expected = *ii_it;
+ REQUIRE_MESSAGE(got == expected, "expected doc_id "
+ << expected
+ << " but got " << got);
+ }
+ if (n) total += n;
+ REQUIRE(blocked_ii_it.has_next() == false);
+ }
+
+ std::cout << total << std::endl;
+ }
+ }
+}
diff --git a/archive/test/test_common.hpp b/archive/test/test_common.hpp
new file mode 100644
index 0000000..c17283f
--- /dev/null
+++ b/archive/test/test_common.hpp
@@ -0,0 +1,88 @@
+#pragma once
+
+#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
+#include "../external/doctest/doctest/doctest.h"
+
+#include <cstring>
+
+#include "types.hpp"
+#include "probe.hpp"
+#include "../benchmark/benchmark_common.hpp"
+
+namespace autocomplete {
+namespace testing {
+
+static std::string test_filename(
+ "../test_data/trec_05_efficiency_queries/"
+ "trec_05_efficiency_queries.completions");
+
+static std::string tmp_filename("tmp.bin");
+
+id_type locate(std::vector<std::string> const& terms, std::string const& t) {
+ return std::distance(terms.begin(),
+ std::lower_bound(terms.begin(), terms.end(), t)) +
+ 1;
+}
+
+range locate_prefix(std::vector<std::string> const& strings,
+ std::string const& p) {
+ auto comp_l = [](std::string const& l, std::string const& r) {
+ if (l.size() < r.size()) {
+ return strncmp(l.c_str(), r.c_str(), l.size()) <= 0;
+ }
+ return strcmp(l.c_str(), r.c_str()) < 0;
+ };
+
+ auto comp_r = [](std::string const& l, std::string const& r) {
+ if (l.size() < r.size()) {
+ return strncmp(l.c_str(), r.c_str(), l.size()) < 0;
+ }
+ return strcmp(l.c_str(), r.c_str()) < 0;
+ };
+
+ range r;
+ r.begin = std::distance(
+ strings.begin(),
+ std::lower_bound(strings.begin(), strings.end(), p, comp_l));
+ r.end = std::distance(
+ strings.begin(),
+ std::upper_bound(strings.begin(), strings.end(), p, comp_r));
+
+ return r;
+}
+
+typedef std::vector<id_type> term_ids;
+
+std::vector<term_ids> gen_random_queries(uint32_t num_queries,
+                                         uint32_t max_num_terms,
+                                         uint32_t max_range_len) {
+    assert(max_num_terms > 1);
+    std::vector<term_ids> queries;
+    queries.reserve(num_queries);
+    essentials::uniform_int_rng<uint32_t> random_num_terms(2, max_num_terms);
+    essentials::uniform_int_rng<uint32_t> random_term_id(1, max_range_len);
+
+ for (uint32_t i = 0; i != num_queries; ++i) {
+ term_ids q;
+ uint32_t num_terms = random_num_terms.gen();
+ q.reserve(num_terms);
+ uint32_t num_distinct_terms = 0;
+ while (true) {
+ q.clear();
+ for (uint32_t i = 0; i != num_terms; ++i) {
+ q.push_back(random_term_id.gen());
+ }
+ std::sort(q.begin(), q.end());
+ auto end = std::unique(q.begin(), q.end());
+ num_distinct_terms = std::distance(q.begin(), end);
+ if (num_distinct_terms >= 2) break;
+ }
+ q.resize(num_distinct_terms);
+ queries.push_back(q);
+ }
+
+ return queries;
+}
+
+} // namespace testing
+} // namespace autocomplete
\ No newline at end of file
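
The two comparators in `testing::locate_prefix` deliberately differ by one character: `comp_l` counts a string that is a proper prefix of `p` as smaller (so `std::lower_bound` skips past it), while `comp_r` counts `p` as not-smaller than any string it prefixes (so `std::upper_bound` runs to the end of the prefixed block). Together they bracket exactly the strings starting with `p`. A standalone illustration with toy data:

```cpp
#include <algorithm>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

// Compute the half-open range of sorted strings that start with prefix p,
// using the same asymmetric comparators as testing::locate_prefix above.
int main() {
    std::vector<std::string> strings = {"for", "ford", "fork", "fort", "four"};
    std::string p = "for";

    auto comp_l = [](std::string const& l, std::string const& r) {
        if (l.size() < r.size())
            return strncmp(l.c_str(), r.c_str(), l.size()) <= 0;
        return strcmp(l.c_str(), r.c_str()) < 0;
    };
    auto comp_r = [](std::string const& l, std::string const& r) {
        if (l.size() < r.size())
            return strncmp(l.c_str(), r.c_str(), l.size()) < 0;
        return strcmp(l.c_str(), r.c_str()) < 0;
    };

    auto begin = std::lower_bound(strings.begin(), strings.end(), p, comp_l);
    auto end = std::upper_bound(strings.begin(), strings.end(), p, comp_r);
    std::cout << end - begin << " strings share the prefix\n";  // prints 4
    return 0;
}
```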
diff --git a/archive/test/test_compact_forward_index.cpp b/archive/test/test_compact_forward_index.cpp
new file mode 100644
index 0000000..dc78c07
--- /dev/null
+++ b/archive/test/test_compact_forward_index.cpp
@@ -0,0 +1,47 @@
+#include "test_common.hpp"
+
+using namespace autocomplete;
+
+TEST_CASE("test compact_forward_index::iterator") {
+ char const* output_filename = testing::tmp_filename.c_str();
+ parameters params;
+ params.collection_basename = testing::test_filename.c_str();
+ params.load();
+
+ {
+ compact_forward_index::builder builder(params);
+ compact_forward_index index;
+ builder.build(index);
+ REQUIRE(index.num_docs() == params.universe);
+ REQUIRE(index.num_terms() == params.num_terms);
+ essentials::save(index, output_filename);
+ }
+
+ {
+ compact_forward_index index;
+ essentials::load(index, output_filename);
+ REQUIRE(index.num_docs() == params.universe);
+ REQUIRE(index.num_terms() == params.num_terms);
+
+ std::ifstream input((params.collection_basename + ".forward").c_str(),
+ std::ios_base::in);
+ for (uint64_t i = 0; i != index.num_terms(); ++i) {
+ auto it = index.iterator(i);
+ uint32_t n = 0;
+ input >> n;
+ REQUIRE_MESSAGE(n == it.size(), "list has size " << it.size()
+ << " instead of "
+ << n);
+ for (uint64_t k = 0; k != n; ++k, ++it) {
+ id_type expected;
+ input >> expected;
+ auto got = *it;
+ REQUIRE_MESSAGE(got == expected,
+ "got " << got << " but expected " << expected);
+ }
+ }
+ input.close();
+
+ std::remove(output_filename);
+ }
+};
diff --git a/archive/test/test_completion_trie.cpp b/archive/test/test_completion_trie.cpp
new file mode 100644
index 0000000..c5155e1
--- /dev/null
+++ b/archive/test/test_completion_trie.cpp
@@ -0,0 +1,37 @@
+#include "test_common.hpp"
+
+using namespace autocomplete;
+
+typedef ef_completion_trie completion_trie_type;
+
+TEST_CASE("test completion_trie::is_member()") {
+ char const* output_filename = testing::tmp_filename.c_str();
+ parameters params;
+ params.collection_basename = testing::test_filename.c_str();
+ params.load();
+
+ {
+ completion_trie_type::builder builder(params);
+ completion_trie_type ct;
+ builder.build(ct);
+ REQUIRE(ct.size() == params.num_completions);
+ essentials::save(ct, output_filename);
+ }
+
+ {
+ completion_trie_type ct;
+ essentials::load(ct, output_filename);
+ REQUIRE(ct.size() == params.num_completions);
+ std::ifstream input(params.collection_basename + ".mapped",
+ std::ios_base::in);
+ INFO("testing is_member()");
+ completion_iterator it(params, input);
+ while (input) {
+ auto& record = *it;
+ REQUIRE(ct.is_member(record.completion));
+ ++it;
+ }
+ input.close();
+ std::remove(output_filename);
+ }
+}
diff --git a/archive/test/test_fc_dictionary.cpp b/archive/test/test_fc_dictionary.cpp
new file mode 100644
index 0000000..50d12b0
--- /dev/null
+++ b/archive/test/test_fc_dictionary.cpp
@@ -0,0 +1,86 @@
+#include "test_common.hpp"
+
+using namespace autocomplete;
+
+TEST_CASE("test fc_dictionary") {
+ char const* output_filename = testing::tmp_filename.c_str();
+ parameters params;
+ params.collection_basename = testing::test_filename.c_str();
+ params.load();
+
+ {
+ fc_dictionary_type::builder builder(params);
+ fc_dictionary_type dict;
+ builder.build(dict);
+ essentials::save(dict, output_filename);
+ }
+
+ {
+ fc_dictionary_type dict;
+ essentials::load(dict, output_filename);
+
+ // test locate() and extract for all strings
+        std::vector<std::string> terms;
+ terms.reserve(params.num_terms);
+ std::ifstream input((params.collection_basename + ".dict").c_str(),
+ std::ios_base::in);
+ if (!input.good()) {
+ throw std::runtime_error("File not found");
+ }
+ std::string term;
+ term.reserve(256 + 1);
+ input >> term;
+ while (input) {
+ terms.push_back(std::move(term));
+ input >> term;
+ }
+ input.close();
+
+        std::vector<uint8_t> decoded(2 * constants::MAX_NUM_CHARS_PER_QUERY);
+
+ for (auto const& t : terms) {
+ id_type expected = testing::locate(terms, t);
+ id_type got = dict.locate(string_to_byte_range(t));
+
+ REQUIRE_MESSAGE(got == expected, "expected id " << expected
+ << ", but got id "
+ << got);
+
+ uint8_t string_len = dict.extract(got, decoded.data());
+ REQUIRE_MESSAGE(string_len == t.size(),
+ "expected size " << t.size() << ", but got size "
+ << string_len);
+
+            auto s = reinterpret_cast<char const*>(decoded.data());
+ for (uint8_t i = 0; i != string_len; ++i) {
+ REQUIRE_MESSAGE(t[i] == s[i], "expected char " << t[i]
+ << " but got "
+ << s[i]);
+ }
+ }
+
+ // test locate_prefix() for all strings
+ std::string prefix;
+ prefix.reserve(256 + 1);
+ for (auto const& t : terms) {
+ uint32_t n = t.size();
+ for (uint32_t prefix_len = 1; prefix_len <= n; ++prefix_len) {
+ prefix.clear();
+ for (uint32_t i = 0; i != prefix_len; ++i) {
+ prefix.push_back(t[i]);
+ }
+
+ range expected = testing::locate_prefix(terms, prefix);
+ range got = dict.locate_prefix(string_to_byte_range(prefix));
+ REQUIRE_MESSAGE((got.begin == expected.begin and
+ got.end == expected.end - 1),
+ "Error for prefix '"
+ << prefix << "' : expected ["
+ << expected.begin << "," << expected.end - 1
+ << "] but got [" << got.begin << ","
+ << got.end << "]");
+ }
+ }
+ std::remove(output_filename);
+ }
+}
diff --git a/archive/test/test_integer_fc_dictionary.cpp b/archive/test/test_integer_fc_dictionary.cpp
new file mode 100644
index 0000000..d36db82
--- /dev/null
+++ b/archive/test/test_integer_fc_dictionary.cpp
@@ -0,0 +1,63 @@
+#include "test_common.hpp"
+
+using namespace autocomplete;
+
+TEST_CASE("test integer_fc_dictionary") {
+ char const* output_filename = testing::tmp_filename.c_str();
+ parameters params;
+ params.collection_basename = testing::test_filename.c_str();
+ params.load();
+
+ {
+ integer_fc_dictionary_type::builder builder(params);
+ integer_fc_dictionary_type dict;
+ builder.build(dict);
+ essentials::save(dict, output_filename);
+ }
+
+ {
+ integer_fc_dictionary_type dict;
+ essentials::load(dict, output_filename);
+
+ {
+ std::ifstream input(
+ (params.collection_basename + ".mapped").c_str(),
+ std::ios_base::in);
+ completion_iterator it(params, input);
+
+ completion_type decoded(2 * constants::MAX_NUM_TERMS_PER_QUERY);
+ for (id_type id = 0; id != params.num_completions; ++id, ++it) {
+ auto const& expected = (*it).completion;
+ REQUIRE(expected.size() > 0);
+ uint8_t size = dict.extract(id, decoded);
+
+ REQUIRE_MESSAGE(expected.size() - 1 == size,
+ "Error in decoding the "
+ << id << "-th string: expected size "
+ << expected.size() - 1 << ","
+ << " but got size " << int(size));
+
+ for (uint8_t i = 0; i != size; ++i) {
+ REQUIRE_MESSAGE(decoded[i] == expected[i],
+ "Error in decoding the "
+ << id << "-th string: expected "
+ << expected[i] << ","
+ << " but got " << decoded[i]
+ << " at position " << int(i));
+ }
+
+ id_type got_id =
+ dict.locate({decoded.data(), decoded.data() + size});
+ REQUIRE(got_id != global::invalid_term_id);
+ REQUIRE_MESSAGE(got_id == id, "Error in locating the "
+ << id
+ << "-th string: expected id "
+ << id << ","
+ << " but got id " << got_id);
+ }
+
+ input.close();
+ }
+ std::remove(output_filename);
+ }
+}
diff --git a/archive/test/test_inverted_index.cpp b/archive/test/test_inverted_index.cpp
new file mode 100644
index 0000000..5faa823
--- /dev/null
+++ b/archive/test/test_inverted_index.cpp
@@ -0,0 +1,135 @@
+#include "test_common.hpp"
+
+using namespace autocomplete;
+
+typedef ef_inverted_index inverted_index_type;
+
+TEST_CASE("test inverted_index::iterator") {
+ char const* output_filename = testing::tmp_filename.c_str();
+ parameters params;
+ params.collection_basename = testing::test_filename.c_str();
+ params.load();
+
+ {
+ inverted_index_type::builder builder(params);
+ inverted_index_type index;
+ builder.build(index);
+ REQUIRE(index.num_docs() == params.universe);
+ REQUIRE(index.num_terms() == params.num_terms);
+ essentials::save(index, output_filename);
+ }
+
+ {
+ inverted_index_type index;
+ essentials::load(index, output_filename);
+ REQUIRE(index.num_docs() == params.universe);
+ REQUIRE(index.num_terms() == params.num_terms);
+
+ std::ifstream input((params.collection_basename + ".inverted").c_str(),
+ std::ios_base::in);
+ for (uint64_t i = 0; i != index.num_terms(); ++i) {
+ auto it = index.iterator(i);
+ uint32_t n = 0;
+ input >> n;
+ REQUIRE_MESSAGE(n == it.size(), "list has size " << it.size()
+ << " instead of "
+ << n);
+ for (uint64_t k = 0; k != n; ++k, ++it) {
+ id_type expected;
+ input >> expected;
+ auto got = *it;
+ REQUIRE_MESSAGE(got == expected,
+ "got " << got << " but expected " << expected);
+ }
+ }
+ input.close();
+
+ std::remove(output_filename);
+ }
+};
+
+TEST_CASE("test inverted_index::intersection_iterator") {
+ char const* output_filename = testing::tmp_filename.c_str();
+ parameters params;
+ params.collection_basename = testing::test_filename.c_str();
+ params.load();
+
+ {
+ inverted_index_type::builder builder(params);
+ inverted_index_type index;
+ builder.build(index);
+ REQUIRE(index.num_docs() == params.universe);
+ REQUIRE(index.num_terms() == params.num_terms);
+ essentials::save(index, output_filename);
+ }
+
+ {
+ inverted_index_type index;
+ essentials::load(index, output_filename);
+ REQUIRE(index.num_docs() == params.universe);
+ REQUIRE(index.num_terms() == params.num_terms);
+
+ static const uint32_t num_queries = 1000000;
+ static const uint32_t max_num_terms = 5;
+ auto queries = testing::gen_random_queries(num_queries, max_num_terms,
+ index.num_terms());
+
+        std::vector<id_type> first(index.num_docs());
+        std::vector<id_type> second(index.num_docs());
+        std::vector<id_type> intersection(index.num_docs());
+
+ for (auto const& q : queries) {
+ uint32_t first_size = 0;
+ uint32_t second_size = 0;
+ assert(q.size() >= 2);
+
+ {
+ auto it = index.iterator(q[0] - 1);
+ first_size = it.size();
+ for (uint32_t i = 0; i != first_size; ++i) {
+ first[i] = it.access(i);
+ }
+ }
+
+ {
+ auto it = index.iterator(q[1] - 1);
+ second_size = it.size();
+ for (uint32_t i = 0; i != second_size; ++i) {
+ second[i] = it.access(i);
+ }
+ }
+
+ auto end = std::set_intersection(
+ first.begin(), first.begin() + first_size, second.begin(),
+ second.begin() + second_size, intersection.begin());
+ first_size = std::distance(intersection.begin(), end);
+ first.swap(intersection);
+
+ for (uint32_t i = 2; i != q.size(); ++i) {
+ auto it = index.iterator(q[i] - 1);
+ second_size = it.size();
+ for (uint32_t i = 0; i != second_size; ++i) {
+ second[i] = it.access(i);
+ }
+ end = std::set_intersection(
+ first.begin(), first.begin() + first_size, second.begin(),
+ second.begin() + second_size, intersection.begin());
+ first_size = std::distance(intersection.begin(), end);
+ first.swap(intersection);
+ }
+
+ auto it = index.intersection_iterator(q);
+ uint32_t n = 0;
+ for (; it.has_next(); ++n, ++it) {
+ auto doc_id = *it;
+ REQUIRE_MESSAGE(
+ doc_id == first[n],
+ "expected doc_id " << first[n] << " but got " << doc_id);
+ }
+ REQUIRE_MESSAGE(n == first_size, "expected " << first_size
+ << " results, but got "
+ << n);
+ }
+ std::remove(output_filename);
+ }
+}
diff --git a/archive/test/test_locate_prefix.cpp b/archive/test/test_locate_prefix.cpp
new file mode 100644
index 0000000..1a81693
--- /dev/null
+++ b/archive/test/test_locate_prefix.cpp
@@ -0,0 +1,102 @@
+#include "test_common.hpp"
+
+using namespace autocomplete;
+
+typedef ef_completion_trie completion_trie_type;
+
+template <typename Dictionary, typename Index>
+void test_locate_prefix(Dictionary const& dict, Index const& index,
+                        std::vector<std::string> const& queries,
+                        std::vector<std::string> const& strings) {
+ for (auto const& query : queries) {
+ range expected = testing::locate_prefix(strings, query);
+ completion_type prefix;
+ byte_range suffix;
+ parse(dict, query, prefix, suffix, true);
+
+ range suffix_lex_range = dict.locate_prefix(suffix);
+ suffix_lex_range.begin += 1;
+ suffix_lex_range.end += 1;
+ range got = index.locate_prefix(prefix, suffix_lex_range);
+
+ CHECK_MESSAGE((got.begin == expected.begin and got.end == expected.end),
+ "Error for query '"
+ << query << "': expected [" << expected.begin << ","
+ << expected.end << ") but got [" << got.begin << ","
+ << got.end << ")");
+ }
+}
+
+TEST_CASE("test locate_prefix()") {
+ parameters params;
+ params.collection_basename = testing::test_filename.c_str();
+ params.load();
+
+ fc_dictionary_type dict;
+ {
+ fc_dictionary_type::builder builder(params);
+ builder.build(dict);
+ }
+
+    std::vector<std::string> strings;
+
+ {
+ essentials::logger("loading all strings...");
+ std::string line;
+ strings.reserve(params.num_completions);
+ std::ifstream input((params.collection_basename).c_str(),
+ std::ios_base::in);
+ for (uint32_t i = 0; i != params.num_completions; ++i) {
+ if (!std::getline(input, line)) break;
+ auto s = line.substr(line.find(' ') + 1, line.size());
+ strings.push_back(s);
+ }
+ input.close();
+ essentials::logger("loaded " + std::to_string(strings.size()) +
+ " strings");
+ }
+
+ constexpr uint32_t max_num_queries = 5000;
+    std::vector<std::string> queries;
+    static std::vector<float> percentages = {0.0, 0.25, 0.50, 0.75, 1.0};
+    static std::vector<uint32_t> query_terms = {1, 2, 3, 4, 5, 6, 7};
+
+ completion_trie_type ct_index;
+ integer_fc_dictionary_type fc_index;
+
+ {
+ completion_trie_type::builder builder(params);
+ builder.build(ct_index);
+ REQUIRE(ct_index.size() == params.num_completions);
+ }
+
+ {
+ integer_fc_dictionary_type::builder builder(params);
+ builder.build(fc_index);
+ REQUIRE(fc_index.size() == params.num_completions);
+ }
+
+ for (auto perc : percentages) {
+ for (auto num_terms : query_terms) {
+ std::cout << "percentage " << perc * 100.0 << "%, num_terms "
+ << num_terms << std::endl;
+ {
+ queries.clear();
+ std::string filename =
+ params.collection_basename +
+ ".queries/queries.length=" + std::to_string(num_terms);
+ std::ifstream querylog(filename.c_str());
+ if (!querylog.is_open()) {
+ std::cerr << "cannot open file '" << filename << "'"
+ << std::endl;
+ return;
+ }
+ load_queries(queries, max_num_queries, perc, querylog);
+ querylog.close();
+ }
+
+ test_locate_prefix(dict, ct_index, queries, strings);
+ test_locate_prefix(dict, fc_index, queries, strings);
+ }
+ }
+}
diff --git a/archive/test/test_unsorted_list.cpp b/archive/test/test_unsorted_list.cpp
new file mode 100644
index 0000000..2760532
--- /dev/null
+++ b/archive/test/test_unsorted_list.cpp
@@ -0,0 +1,172 @@
+#include "test_common.hpp"
+
+using namespace autocomplete;
+
+uint32_t naive_topk(std::vector<id_type> const& input, range r, uint32_t k,
+                    std::vector<id_type>& topk, bool unique = false) {
+ uint32_t range_len = r.end - r.begin;
+ for (uint32_t i = 0; i != range_len; ++i) {
+ topk[i] = input[r.begin + i];
+ }
+ std::sort(topk.begin(), topk.begin() + range_len);
+ uint32_t results = 0;
+ if (unique) {
+ auto end = std::unique(topk.begin(), topk.begin() + range_len);
+        results = std::min<uint32_t>(k, std::distance(topk.begin(), end));
+ } else {
+ results = std::min(k, range_len);
+ }
+ return results;
+}
+
+std::vector<range> gen_random_queries(uint32_t num_queries,
+ uint32_t max_range_len) {
+ std::vector