diff --git a/.gitignore b/.gitignore
index 3094469..b884c82 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,94 @@
+# Rust specific
+/target/
+**/target/
+**/*.rs.bk
+Cargo.lock
+*.pdb
+
+# Protocol Buffers
+*.pb.h
+*.pb.cc
+*.pb.go
+*.pb.swift
+*.pb.dart
+*.pb.js
+*.pb.ts
+*.pb.rs
+
+# Generated Rust files
+/src/autocomplete_proto.rs
+/src/autocomplete_proto/*.rs
+
+# C++ specific
+*.o
+*.obj
+*.exe
+*.out
+*.app
+*.dll
+*.so
+*.dylib
+*.a
+*.lib
+*.d
+*.lo
+*.la
+*.lai
+*.Plo
+*.Pla
+*.l
+*.elf
+*.bin
+*.hex
+*.map
+*.lst
+*.sym
+*.lss
+*.eep
+
+# Build directories
+/build/
+**/build/
+/debug_build/
+**/debug_build/
+/CMakeFiles/
+**/CMakeFiles/
+/CMakeCache.txt
+**/CMakeCache.txt
+/CMakeScripts/
+**/CMakeScripts/
+/Testing/
+**/Testing/
+/Makefile
+**/Makefile
+/cmake_install.cmake
+**/cmake_install.cmake
+/install_manifest.txt
+**/install_manifest.txt
+/compile_commands.json
+**/compile_commands.json
+/CTestTestfile.cmake
+**/CTestTestfile.cmake
+/_deps
+**/_deps
+/.cmake
+**/.cmake
+
+# IDE specific
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+# OS specific
 .DS_Store
-build
+Thumbs.db
+
+# Project specific
+*.mapped
+*.mapped.stats
+*.dict
+*.inverted
+*.forward
+*.bin
diff --git a/.gitmodules b/.gitmodules
index 72f21cd..5b9dc7e 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -7,3 +7,9 @@
 [submodule "external/mongoose"]
 	path = external/mongoose
 	url = https://github.com/cesanta/mongoose.git
+[submodule "external/doctest"]
+	path = external/doctest
+	url = https://github.com/onqtam/doctest.git
+[submodule "external/cmd_line_parser"]
+	path = external/cmd_line_parser
+	url = https://github.com/jermp/cmd_line_parser.git
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..35abc20
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright 2019 Giulio Ermanno Pibiri
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
\ No newline at end of file
diff --git a/README.md b/README.md
index b1a0946..624670f 100644
--- a/README.md
+++ b/README.md
@@ -1,177 +1,260 @@
-Autocomplete
-------------
+# Autocomplete System

-Query autocompletion in C++.
+This repository contains an autocomplete system implementation. The original C++ implementation is being ported to Rust and will be containerized for easier deployment and testing.

-##### Table of contents
-1. [Description](#descr)
-2. [Compiling the code](#compiling)
-3. [Input data format](#input)
-4. 
[Benchmarks](#benchmarks) -5. [Live demo](#demo) +## Project Structure -Description ------------ +- `autocomplete-rs/`: The Rust port of the original C++ implementation +- `archive/`: Original C++ implementation and related files + +## Goals + +1. Port the C++ implementation to Rust while maintaining the same functionality +2. Leverage Rust's safety guarantees and modern tooling +3. Containerize the application using Docker for easy deployment and testing + +## Current Status + +The porting process is ongoing. The following components have been ported to Rust: + +- Basic constants and configuration +- Parameters management +- Performance measurement probes + +## Building and Testing -We designed two solutions (`autocomplete.hpp` and `autocomplete2.hpp`). -The second solution avoids storing the forward index of the first solution. +### Original C++ Implementation +```bash +cd archive +make +``` -Both solution build on two steps: (1) a prefix search (`prefix_topk`) and (2) a conjunctive search (`conjunctive_topk`). +### Rust Implementation +```bash +cd autocomplete-rs +cargo build +cargo test +``` -Recall that each completion has an associated integer identifier (henceforth, called docID), assigned in *decreasing* score order. +## License -#### 1. Prefix search +This project is licensed under the MIT License - see the LICENSE file for details. -This step returns the top-k completions that are prefixed by the terms in the query. -For this purposes, we build a dictionary storing all completions seen as (multi-) sets of termIDs. -Solution 1 uses an integer trie data structure (`completion_trie.hpp`); -Solution 2 uses Front Coding (`integer_fc_dictionary.hpp`). -We also materialize the list L of docIDs sorted by the lexicographical order of the completions (`unsorted_list.hpp`). +Autocomplete +------------ + +A Query Auto-Completion system based on the paper *[Efficient and Effective Query Auto-Completion](https://dl.acm.org/doi/10.1145/3397271.3401432)*, by Simon Gog, Giulio Ermanno Pibiri, and Rossano Venturini, +published in ACM SIGIR 2020. + +Please, cite the paper if you use the data structures from this library. + +##### Table of contents +1. [Installation and quick start](#install) +2. [Compiling the code](#compiling) +3. [Input data format](#input) +4. [Running the unit tests](#testing) +5. [Building an index](#building) +6. [Benchmarks](#benchmarks) +7. [Live demo](#demo) + +Installation and quick start +------------------ -During a search, we first map the query terms to their lexicographic IDs by using a string dictionary (implemented as a 2-level index with Front Coding -- `fc_dictionary.hpp`). Then, we search the mapped query, say Q, into the completion trie to obtain the lexicographic range [l,r] of all completions that are children of Q. Then we need to identify the top-k docIDs from L[l,r]. Since the range [l,r] can be very large, we use a RMQ data structure built on L. +Just run -Having retrieved a list of (at most) k docIDs, we then: + bash ./install.sh -1. Solution 1: use a forward index (`forward_index.hpp`) to materialize the identified completions into a string pool (`scored_string_pool.hpp`). -The forward index stores the sorted (multi-) set of the termIDs of each completion, plus also the permutation of such termIDs in order to restore the original completion. The sets are stored in increasing-docID order. -Specifically, we use the forward index to obtain the (permuted) set -of termIDs and the string dictionary to extract the strings. +from the parent directory. 
The script builds the code, prepares the test data in the folder `test_data/trec_05_efficiency_queries` for indexing, and executes the unit tests.
+
+After that, for a minimal running example, just run
+
+    bash ./example.sh
+
+and then access the service [from localhost](http://localhost:8000).
+
+### Or you can use a prebuilt Docker image
+
+The following command pulls a prebuilt Docker image and runs it locally.
+
+    docker pull jermp/autocomplete
+    docker run -p 8000:8000 -d jermp/autocomplete
+
+And then access the service [from localhost](http://localhost:8000).

Compiling the code
------------------

-The code is tested on Linux with `gcc` 7.4.0 and on Mac 10.14 with `clang` 10.0.0.
+The code has been tested on Linux with `gcc` 7.4.0, 8.3.0, and 9.0.0, and on Mac OS 10.14 and 12.4 with `clang` 10.0.0 and 13.0.0.
+
To build the code, [`CMake`](https://cmake.org/) is required.

Clone the repository with

-    $ git clone --recursive https://github.com/jermp/autocomplete.git
+    git clone --recursive https://github.com/jermp/autocomplete.git

If you have cloned the repository without `--recursive`, you will need to
perform the following commands before compiling:

-    $ git submodule init
-    $ git submodule update
+    git submodule init
+    git submodule update

To compile the code for a release environment (see file `CMakeLists.txt` for the used compilation flags), it is sufficient to do the following:

-    $ mkdir build
-    $ cd build
-    $ cmake ..
-    $ make
+    mkdir build
+    cd build
+    cmake ..
+    make

-Hint: Use `make -j4` to compile the library in parallel using, e.g., 4 jobs.
+Hint: Use `make -j` to compile the library in parallel using all
+available threads.

For best performance, we recommend compiling with:

-    $ `cmake .. -DCMAKE_BUILD_TYPE=Release -DUSE_SANITIZERS=Off -DUSE_INTRINSICS=On -DUSE_PDEP=On`
+    cmake .. -DCMAKE_BUILD_TYPE=Release -DUSE_SANITIZERS=Off -DUSE_INTRINSICS=On -DUSE_PDEP=On

For a testing environment, use the following instead:

-    $ mkdir debug_build
-    $ cd debug_build
-    $ cmake .. -DCMAKE_BUILD_TYPE=Debug -DUSE_SANITIZERS=On
-    $ make
-
+    mkdir debug_build
+    cd debug_build
+    cmake .. -DCMAKE_BUILD_TYPE=Debug -DUSE_SANITIZERS=On
+    make
+
Input data format
-----------------

The input file should list all completions in *lexicographical* order.
-For example, see the the file `test_data/trec05_efficiency_queries/trec05_efficiency_queries.completions`.
+For example, see the file `test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions`.

The first column represents the ID of the completion;
the other columns contain the tokens separated by white spaces.
-(The IDs for the file `trec05_efficiency_queries.completions` are
+(The IDs for the file `trec_05_efficiency_queries.completions` are
fake, i.e., they do not take into account any particular assignment.)

-The scripts in the directory `test_data` help in
-preparing the datasets for indexing:
+The script `preprocess.sh` in the directory `test_data` helps
+in preparing the data for indexing.
+Thus, from within the directory `test_data`, it is sufficient
+to do:

-1. The command
-
-       $ extract_dict.py trec05_efficiency_queries/trec05_efficiency_queries.completions
-
-   extract the dictionary
-from a file listing all completions in textual form.
+    bash preprocess.sh

-2. The command
+Therefore, for our example with `trec_05_efficiency_queries`, it would be:

-       $ python map_dataset.py trec05_efficiency_queries/trec05_efficiency_queries.completions
-
-   maps strings to integer ids.
+    bash preprocess.sh trec_05_efficiency_queries/trec_05_efficiency_queries.completions 300

-3. The command
+The second argument in the example, i.e., 300, represents the
+number of completions (per completion size) that are drawn at
+random and could be used to query the indexes.

-       $ python build_stats.py trec05_efficiency_queries/trec05_efficiency_queries.completions.mapped
-
-   calulcates the dataset statistics.
+If you run the script, you will get:

-4. The command
-
-       $ python build_inverted_and_forward.py trec05_efficiency_queries/trec05_efficiency_queries.completions
-
-   builds the inverted and forward files.
-
-If you run the scripts in the reported order, you will get:
-
-- `trec05_efficiency_queries.completions.dict`: lists all the distinct
+- `trec_05_efficiency_queries.completions.dict`: lists all the distinct
tokens in the completions sorted in lexicographical order.

-- `trec05_efficiency_queries.completions.mapped`: lists all completions
+- `trec_05_efficiency_queries.completions.mapped`: lists all completions
whose tokens have been mapped to integer ids as assigned by a
lexicographically-sorted string dictionary (that should be built from the
-tokens listed in `trec05_efficiency_queries.completions.dict`).
+tokens listed in `trec_05_efficiency_queries.completions.dict`).
Each completion terminates with the id `0`.

-- `trec05_efficiency_queries.completions.mapped.stats` contains some
+- `trec_05_efficiency_queries.completions.mapped.stats` contains some
statistics about the datasets, needed to build the data structures more
efficiently.
-- `trec05_efficiency_queries.completions.inverted` is the inverted file.
-- `trec05_efficiency_queries.completions.forward` is the forward file. Note that each list is *not* sorted, thus the lists are the same as the ones contained in `trec05_efficiency_queries.completions.mapped` but sorted in docID order.
+- `trec_05_efficiency_queries.completions.inverted` is the inverted file.
+- `trec_05_efficiency_queries.completions.forward` is the forward file. Note that each list is *not* sorted; the lists are the same as the ones contained in `trec_05_efficiency_queries.completions.mapped`, but arranged in docID order.
+
+Running the unit tests
+-----------
+
+The unit tests are written using [doctest](https://github.com/onqtam/doctest).
+
+After compilation and preparation of the data for indexing (see Section [Input data format](#input)), it is advised
+to run the unit tests with:
+
+    make test
+
+Building an index
+-----------
+
+After compiling the code, run the program `./build` to build an index. You can specify the type of the index and the name of the file
+where the index will be written.
+
+For example, with
+
+    ./build ef_type1 ../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions -o trec05.ef_type1.bin
+
+we can build an index of type `ef_type1` from the test file `../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions`, which will be serialized to the file `trec05.ef_type1.bin`.
+
+Possible types are `ef_type1`, `ef_type2`, `ef_type3` and `ef_type4`.
+
+Note: the type `ef_type4` requires an extra parameter
+to be specified, `c`. Use for example: `-c 0.0001`.

Benchmarks
----------

-Run `benchmark/benchmark_prefix_topk` and `benchmark/benchmark_conjunctive_topk`.
+To run the top-k benchmarks in the `/benchmark` directory,
+we first need some query logs.
+They should have been created already if you have run the
+script `preprocess.sh`; otherwise,
+you can use
+
+    python3 partition_queries_by_length.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions trec_05_efficiency_queries/trec_05_efficiency_queries.completions.queries 300
+
+to partition the input completions by number of query terms
+and retain 300 queries at random.
+Query files are placed in the output directory
+`trec_05_efficiency_queries/trec_05_efficiency_queries.completions.queries`.
+(By default, 7 shards will be created: the ones having [1,6] query terms and
+the one collecting all completions with *at least* 7 query terms).
+
+Then the command
+
+    ./benchmark_topk ef_type1 10 trec05.ef_type1.bin 3 300 0.25 < ../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions.queries/queries.length=3.shuffled
+
+will execute 300 top-10 queries with 3 terms, retaining only 25%
+of each query's last token.
+
+We automated the collection of results with the script `script/collect_results_by_varying_percentage.py`.
+From within the `/build` directory, run
+
+    python3 ../script/collect_results_by_varying_percentage.py ef_type1 topk trec05.ef_type1.bin ../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions 10 300
+
+To benchmark the dictionaries (Front-Coding and trie), just run the following script from within
+the `script` directory:

-See the directory `results` for the results on the AOL and MSN query log.
+    bash benchmark_dictionaries.sh

Live demo
---------

Start the web server with the program `./web_server <port> <index_filename>` and access the demo at
-`localhost:<port>`.
\ No newline at end of file
+`localhost:<port>`.
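+
+For reference, here is a minimal C++ sketch of the query API exercised by the benchmarks and the web server. It is only a sketch: it assumes the `ef_autocomplete_type1` type from `types.hpp`, the `nop_probe` used by `benchmark/effectiveness.cpp`, and an index previously serialized to `trec05.ef_type1.bin`; the query string is arbitrary.
+
+    #include <iostream>
+    #include "types.hpp"
+
+    using namespace autocomplete;
+
+    int main() {
+        ef_autocomplete_type1 index;
+        essentials::load(index, "trec05.ef_type1.bin");  // load a serialized index
+        nop_probe probe;  // probe with no timing instrumentation
+        auto it = index.prefix_topk("south ca", 10, probe);  // top-10 completions
+        for (uint64_t i = 0; i != it.size(); ++i, ++it) {
+            auto completion = *it;
+            std::cout << completion.score << ": "
+                      << std::string(completion.string.begin,
+                                     completion.string.end)
+                      << std::endl;
+        }
+        return 0;
+    }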
diff --git a/TODO.md b/TODO.md deleted file mode 100644 index 082ced9..0000000 --- a/TODO.md +++ /dev/null @@ -1,2 +0,0 @@ - -- Study the effect of compression. diff --git a/archive/.github/workflows/continuous_integration.yml b/archive/.github/workflows/continuous_integration.yml new file mode 100644 index 0000000..bf625be --- /dev/null +++ b/archive/.github/workflows/continuous_integration.yml @@ -0,0 +1,61 @@ +name: Continuous Integration + +on: + [ push,pull_request ] + +jobs: + build: + name: Continuous Integration + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ ubuntu-latest ] + steps: + + - name: Checkout code + uses: actions/checkout@v2 + + - name: Checkout submodules + run: git submodule update --init --recursive + + - name: Check cmake version + run: cmake --version + + - name: Creating build directory + run: cmake -E make_directory ./build + + - name: Precompilation + working-directory: ./build + run: cmake .. -DCMAKE_BUILD_TYPE=Release + + - name: Compilation + working-directory: ./build + run: cmake --build . --config Release + + - name: Setup python + uses: actions/setup-python@v3 + with: + python-version: '3.x' + architecture: 'x64' + + - name: Data preprocessing + working-directory: ./test_data + run: bash preprocess.sh trec_05_efficiency_queries/trec_05_efficiency_queries.completions 300 + + - name: Testing + working-directory: ./build + run: ctest + + - name: Build binary dictionary + working-directory: build + run: chmod +x build && ./build ef_type1 ../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions -o trec_05.ef_type1.bin + + - name: Building docker image + run: docker build -t ${{ secrets.DOCKERHUB_USERNAME }}/autocomplete:latest . + + - name: Dockerhub Authentication + run: docker login --username ${{ secrets.DOCKERHUB_USERNAME }} --password ${{ secrets.DOCKERHUB_ACCESS_TOKEN }} + + - name: Publishing image to Container Registry + if: github.ref == 'refs/heads/master' + run: docker push ${{ secrets.DOCKERHUB_USERNAME }}/autocomplete:latest diff --git a/CMakeLists.txt b/archive/CMakeLists.txt similarity index 79% rename from CMakeLists.txt rename to archive/CMakeLists.txt index 4c90e49..9b3c162 100644 --- a/CMakeLists.txt +++ b/archive/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 2.8) +cmake_minimum_required(VERSION 3.5) project(AUTOCOMPLETE) if(CMAKE_BUILD_TYPE MATCHES Debug) @@ -21,7 +21,7 @@ endif () if(UNIX) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb") @@ -50,4 +50,11 @@ include_directories(${AUTOCOMPLETE_SOURCE_DIR}/include) add_subdirectory(external) add_subdirectory(src) add_subdirectory(benchmark) -add_subdirectory(test) \ No newline at end of file + +enable_testing() +file(GLOB TEST_SOURCES test/test_*.cpp) +foreach(TEST_SRC ${TEST_SOURCES}) + get_filename_component (TEST_SRC_NAME ${TEST_SRC} NAME_WE) # without extension + add_executable(${TEST_SRC_NAME} ${TEST_SRC}) + add_test(${TEST_SRC_NAME} ${TEST_SRC_NAME}) +endforeach(TEST_SRC) diff --git a/archive/Dockerfile b/archive/Dockerfile new file mode 100644 index 0000000..f29c164 --- /dev/null +++ b/archive/Dockerfile @@ -0,0 +1,25 @@ +FROM ubuntu:latest + +EXPOSE 8000 + +RUN groupadd appgroup && useradd appuser -G appgroup + +COPY . /src + +WORKDIR /app + +RUN apt update && apt install -y cmake g++ python3 + +RUN cmake /src && cmake --build . 
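+
+# The compiled binaries (web_server, build) land in /app, the current WORKDIR.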
+ +RUN chmod +x web_server && chmod +x build + +RUN ./build ef_type1 /src/test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions -o trec_05.ef_type1.bin + +RUN apt purge -y cmake g++ python3 + +RUN rm -rf /src + +USER appuser + +CMD ["./web_server", "8000", "trec_05.ef_type1.bin"] diff --git a/benchmark/CMakeLists.txt b/archive/benchmark/CMakeLists.txt similarity index 59% rename from benchmark/CMakeLists.txt rename to archive/benchmark/CMakeLists.txt index cf8359f..8f2c632 100644 --- a/benchmark/CMakeLists.txt +++ b/archive/benchmark/CMakeLists.txt @@ -1,5 +1,7 @@ -add_executable(benchmark_topk benchmark_topk.cpp) +# add_executable(benchmark_topk benchmark_topk.cpp) add_executable(benchmark_prefix_topk benchmark_prefix_topk.cpp) add_executable(benchmark_conjunctive_topk benchmark_conjunctive_topk.cpp) add_executable(benchmark_fc_dictionary benchmark_fc_dictionary.cpp) -add_executable(benchmark_integer_fc_dictionary benchmark_integer_fc_dictionary.cpp) \ No newline at end of file +add_executable(benchmark_integer_fc_dictionary benchmark_integer_fc_dictionary.cpp) +add_executable(benchmark_locate_prefix benchmark_locate_prefix.cpp) +add_executable(effectiveness effectiveness.cpp) \ No newline at end of file diff --git a/archive/benchmark/benchmark_common.hpp b/archive/benchmark/benchmark_common.hpp new file mode 100644 index 0000000..1a96333 --- /dev/null +++ b/archive/benchmark/benchmark_common.hpp @@ -0,0 +1,131 @@ +#pragma once + +#include "../external/cmd_line_parser/include/parser.hpp" +#include "probe.hpp" + +namespace autocomplete { + +namespace benchmarking { +static const uint32_t runs = 5; +} + +// void tolower(std::string& str) { +// std::transform(str.begin(), str.end(), str.begin(), +// [](unsigned char c) { return std::tolower(c); }); +// } + +size_t load_queries(std::vector& queries, uint32_t max_num_queries, + float percentage, std::istream& is = std::cin) { + assert(percentage >= 0.0 and percentage <= 1.0); + std::string query; + queries.reserve(max_num_queries); + for (uint32_t i = 0; i != max_num_queries; ++i) { + if (!std::getline(is, query)) break; + assert(query.size() > 0); + size_t size = query.size() - 1; + while (size > 0 and query[size] != ' ') --size; + size_t last_token_size = query.size() - size; + size_t end = size + std::ceil(last_token_size * percentage) + 1 + + 1; // retain at least one char + for (size = query.size(); size > end; --size) query.pop_back(); + // tolower(query); + queries.push_back(query); + } + return queries.size(); +} + +void configure_parser_for_benchmarking(cmd_line_parser::parser& parser) { + parser.add("type", "Index type."); + parser.add("k", "top-k value."); + parser.add("index_filename", "Index filename."); + parser.add("num_terms_per_query", "Number of terms per query."); + parser.add("max_num_queries", "Maximum number of queries to execute."); + parser.add("percentage", + "A float in [0,1] specifying how much we keep of the last token " + "in a query: n x 100 <=> n%, for n in [0,1]."); +} + +#define BENCHMARK(what) \ + template \ + void benchmark(std::string const& index_filename, uint32_t k, \ + uint32_t max_num_queries, float keep, \ + essentials::json_lines& breakdowns) { \ + Index index; \ + essentials::load(index, index_filename.c_str()); \ + \ + std::vector queries; \ + uint32_t num_queries = \ + load_queries(queries, max_num_queries, keep, std::cin); \ + \ + uint64_t reported_strings = 0; \ + auto musec_per_query = [&](double time) { \ + return time / (benchmarking::runs * num_queries); \ + }; \ + \ + 
breakdowns.add("num_queries", std::to_string(num_queries)); \ + \ + timer_probe probe(3); \ + for (uint32_t run = 0; run != benchmarking::runs; ++run) { \ + for (auto const& query : queries) { \ + auto it = index.what##topk(query, k, probe); \ + reported_strings += it.size(); \ + } \ + } \ + std::cout << "#ignore: " << reported_strings << std::endl; \ + \ + breakdowns.add("reported_strings", \ + std::to_string(reported_strings / benchmarking::runs)); \ + breakdowns.add( \ + "parsing_musec_per_query", \ + std::to_string(musec_per_query(probe.get(0).elapsed()))); \ + breakdowns.add( \ + std::string(#what) + "search_musec_per_query", \ + std::to_string(musec_per_query(probe.get(1).elapsed()))); \ + breakdowns.add( \ + "reporting_musec_per_query", \ + std::to_string(musec_per_query(probe.get(2).elapsed()))); \ + breakdowns.add( \ + "total_musec_per_query", \ + std::to_string(musec_per_query(probe.get(0).elapsed()) + \ + musec_per_query(probe.get(1).elapsed()) + \ + musec_per_query(probe.get(2).elapsed()))); \ + } \ + \ + int main(int argc, char** argv) { \ + cmd_line_parser::parser parser(argc, argv); \ + configure_parser_for_benchmarking(parser); \ + if (!parser.parse()) return 1; \ + \ + auto type = parser.get("type"); \ + auto k = parser.get("k"); \ + auto index_filename = parser.get("index_filename"); \ + auto max_num_queries = parser.get("max_num_queries"); \ + auto keep = parser.get("percentage"); \ + \ + essentials::json_lines breakdowns; \ + breakdowns.new_line(); \ + breakdowns.add("num_terms_per_query", \ + parser.get("num_terms_per_query")); \ + breakdowns.add("percentage", std::to_string(keep)); \ + \ + if (type == "ef_type1") { \ + benchmark( \ + index_filename, k, max_num_queries, keep, breakdowns); \ + } else if (type == "ef_type2") { \ + benchmark( \ + index_filename, k, max_num_queries, keep, breakdowns); \ + } else if (type == "ef_type3") { \ + benchmark( \ + index_filename, k, max_num_queries, keep, breakdowns); \ + } else if (type == "ef_type4") { \ + benchmark( \ + index_filename, k, max_num_queries, keep, breakdowns); \ + } else { \ + return 1; \ + } \ + \ + breakdowns.print(); \ + return 0; \ + } + +} // namespace autocomplete \ No newline at end of file diff --git a/archive/benchmark/benchmark_conjunctive_topk.cpp b/archive/benchmark/benchmark_conjunctive_topk.cpp new file mode 100644 index 0000000..df14c84 --- /dev/null +++ b/archive/benchmark/benchmark_conjunctive_topk.cpp @@ -0,0 +1,7 @@ +#include + +#include "types.hpp" +#include "benchmark_common.hpp" + +using namespace autocomplete; +BENCHMARK(conjunctive_) \ No newline at end of file diff --git a/benchmark/benchmark_fc_dictionary.cpp b/archive/benchmark/benchmark_fc_dictionary.cpp similarity index 52% rename from benchmark/benchmark_fc_dictionary.cpp rename to archive/benchmark/benchmark_fc_dictionary.cpp index f566edd..d3e66b5 100644 --- a/benchmark/benchmark_fc_dictionary.cpp +++ b/archive/benchmark/benchmark_fc_dictionary.cpp @@ -8,10 +8,10 @@ using namespace autocomplete; template void perf_test(Dictionary const& dict, std::vector const& queries) { - std::vector decoded(2 * constants::MAX_NUM_CHARS_PER_QUERY); + static std::vector decoded(2 * constants::MAX_NUM_CHARS_PER_QUERY); essentials::timer_type timer; - for (uint32_t i = 0; i != runs; ++i) { + for (uint32_t i = 0; i != benchmarking::runs; ++i) { timer.start(); for (auto const& query : queries) { id_type id = dict.locate(string_to_byte_range(query)); @@ -20,8 +20,8 @@ void perf_test(Dictionary const& dict, timer.stop(); } - std::cout << "locate: " << 
(timer.average() * 1000.0) / queries.size() - << " [ns/string]" << std::endl; + std::cout << "locate: " << timer.average() / queries.size() + << " [musec/string]" << std::endl; std::vector ids; ids.reserve(queries.size()); @@ -32,7 +32,7 @@ void perf_test(Dictionary const& dict, timer.reset(); - for (uint32_t i = 0; i != runs; ++i) { + for (uint32_t i = 0; i != benchmarking::runs; ++i) { timer.start(); for (auto const& id : ids) { uint8_t string_len = dict.extract(id, decoded.data()); @@ -41,8 +41,30 @@ void perf_test(Dictionary const& dict, timer.stop(); } - std::cout << "extract: " << (timer.average() * 1000.0) / ids.size() - << " [ns/string]" << std::endl; + std::cout << "extract: " << timer.average() / ids.size() + << " [musec/string]" << std::endl; + + static std::vector percentages = {0.0, 0.25, 0.50, 0.75, 1.0}; + for (auto p : percentages) { + timer.reset(); + for (uint32_t i = 0; i != benchmarking::runs; ++i) { + timer.start(); + for (auto const& query : queries) { + size_t size = query.size(); + size_t n = size * p; + if (n == 0) n += 1; // at least one char + uint8_t const* addr = + reinterpret_cast(query.data()); + range r = dict.locate_prefix({addr, addr + n}); + essentials::do_not_optimize_away(r.end - r.begin); + } + timer.stop(); + } + + std::cout << "\tlocate_prefix-" << p * 100.0 + << "%: " << timer.average() / queries.size() + << " [musec/string]" << std::endl; + } } #define exe(BUCKET_SIZE) \ @@ -57,30 +79,29 @@ void perf_test(Dictionary const& dict, } int main(int argc, char** argv) { - int mandatory = 2 + 1; - if (argc < mandatory) { - std::cout << argv[0] << " < queries" - << std::endl; - return 1; - } + cmd_line_parser::parser parser(argc, argv); + parser.add("collection_basename", "Collection basename."); + parser.add("max_num_queries", "Maximum number of queries to execute."); + if (!parser.parse()) return 1; parameters params; - params.collection_basename = argv[1]; + params.collection_basename = parser.get("collection_basename"); params.load(); - uint32_t num_queries = std::atoi(argv[2]); + auto max_num_queries = parser.get("max_num_queries"); essentials::logger("loading queries..."); std::vector queries; - queries.reserve(num_queries); + queries.reserve(max_num_queries); std::string query; query.reserve(2 * constants::MAX_NUM_CHARS_PER_QUERY); - for (uint32_t i = 0; i != num_queries; ++i) { + for (uint32_t i = 0; i != max_num_queries; ++i) { if (!std::getline(std::cin, query)) break; queries.push_back(std::move(query)); } - num_queries = queries.size(); - essentials::logger("loaded " + std::to_string(num_queries) + " queries"); + max_num_queries = queries.size(); + essentials::logger("loaded " + std::to_string(max_num_queries) + + " queries"); exe(4) exe(8) exe(16) exe(32) exe(64) exe(128) exe(256) return 0; } \ No newline at end of file diff --git a/benchmark/benchmark_integer_fc_dictionary.cpp b/archive/benchmark/benchmark_integer_fc_dictionary.cpp similarity index 94% rename from benchmark/benchmark_integer_fc_dictionary.cpp rename to archive/benchmark/benchmark_integer_fc_dictionary.cpp index f1e35d9..8cb2b32 100644 --- a/benchmark/benchmark_integer_fc_dictionary.cpp +++ b/archive/benchmark/benchmark_integer_fc_dictionary.cpp @@ -8,10 +8,10 @@ using namespace autocomplete; template void perf_test(Dictionary const& dict, std::vector const& queries) { - completion_type decoded(2 * constants::MAX_NUM_CHARS_PER_QUERY); + static completion_type decoded(2 * constants::MAX_NUM_CHARS_PER_QUERY); essentials::timer_type timer; - for (uint32_t i = 0; i != runs; ++i) { 
+ for (uint32_t i = 0; i != benchmarking::runs; ++i) { timer.start(); for (auto const& id : queries) { uint8_t string_len = dict.extract(id, decoded); diff --git a/archive/benchmark/benchmark_locate_prefix.cpp b/archive/benchmark/benchmark_locate_prefix.cpp new file mode 100644 index 0000000..a9e374a --- /dev/null +++ b/archive/benchmark/benchmark_locate_prefix.cpp @@ -0,0 +1,111 @@ +#include + +#include "types.hpp" +#include "statistics.hpp" +#include "benchmark_common.hpp" + +using namespace autocomplete; + +typedef std::pair query_type; + +template +void benchmark(parameters const& params, std::vector& queries, + uint32_t num_queries, uint32_t num_terms_per_query, float keep) { + essentials::json_lines result; + result.new_line(); + result.add("num_terms_per_query", std::to_string(num_terms_per_query)); + result.add("percentage", std::to_string(keep)); + result.add("num_queries", std::to_string(num_queries)); + + Index index; + { + typename Index::builder builder(params); + builder.build(index); + } + + result.add("MiB", std::to_string(static_cast(index.bytes()) / + essentials::MiB)); + result.add( + "bytes_per_completion", + std::to_string(static_cast(index.bytes()) / index.size())); + + essentials::timer_type timer; + timer.start(); + for (uint32_t run = 0; run != benchmarking::runs; ++run) { + for (auto& query : queries) { + auto r = index.locate_prefix(query.first, query.second); + essentials::do_not_optimize_away(r.end - r.begin); + } + } + timer.stop(); + result.add( + "musec_per_query", + std::to_string(timer.elapsed() / (benchmarking::runs * num_queries))); + result.print(); +} + +int main(int argc, char** argv) { + cmd_line_parser::parser parser(argc, argv); + parser.add("type", "Index type."); + parser.add("collection_basename", "Collection basename."); + parser.add("num_terms_per_query", "Number of terms per query."); + parser.add("max_num_queries", "Maximum number of queries to execute."); + parser.add("percentage", + "A float in [0,1] specifying how much we keep of the last token " + "in a query."); + if (!parser.parse()) return 1; + + parameters params; + params.collection_basename = parser.get("collection_basename"); + params.load(); + + auto type = parser.get("type"); + auto max_num_queries = parser.get("max_num_queries"); + auto num_terms_per_query = parser.get("num_terms_per_query"); + auto keep = parser.get("percentage"); + + fc_dictionary_type dict; + { + fc_dictionary_type::builder builder(params); + builder.build(dict); + } + + std::vector strings; + std::vector queries; + uint32_t num_queries = 0; + + { + num_queries = load_queries(strings, max_num_queries, keep, std::cin); + for (auto const& string : strings) { + completion_type prefix; + byte_range suffix; + parse(dict, string, prefix, suffix, true); + range suffix_lex_range = dict.locate_prefix(suffix); + queries.emplace_back(prefix, suffix_lex_range); + } + } + + if (type == "trie") { + benchmark(params, queries, num_queries, + num_terms_per_query, keep); + } else if (type == "fc") { + // benchmark>(params, queries, num_queries, + // num_terms_per_query, keep); + // benchmark>(params, queries, num_queries, + // num_terms_per_query, keep); + benchmark>(params, queries, num_queries, + num_terms_per_query, keep); + // benchmark>(params, queries, num_queries, + // num_terms_per_query, keep); + // benchmark>(params, queries, num_queries, + // num_terms_per_query, keep); + // benchmark>(params, queries, num_queries, + // num_terms_per_query, keep); + // benchmark>(params, queries, num_queries, + // 
num_terms_per_query, keep); + } else { + return 1; + } + + return 0; +} \ No newline at end of file diff --git a/archive/benchmark/benchmark_prefix_topk.cpp b/archive/benchmark/benchmark_prefix_topk.cpp new file mode 100644 index 0000000..69a0bc1 --- /dev/null +++ b/archive/benchmark/benchmark_prefix_topk.cpp @@ -0,0 +1,7 @@ +#include + +#include "types.hpp" +#include "benchmark_common.hpp" + +using namespace autocomplete; +BENCHMARK(prefix_) \ No newline at end of file diff --git a/archive/benchmark/benchmark_topk.cpp b/archive/benchmark/benchmark_topk.cpp new file mode 100644 index 0000000..98d208c --- /dev/null +++ b/archive/benchmark/benchmark_topk.cpp @@ -0,0 +1,7 @@ +#include + +#include "types.hpp" +#include "benchmark_common.hpp" + +using namespace autocomplete; +BENCHMARK("") \ No newline at end of file diff --git a/archive/benchmark/effectiveness.cpp b/archive/benchmark/effectiveness.cpp new file mode 100644 index 0000000..e9c6590 --- /dev/null +++ b/archive/benchmark/effectiveness.cpp @@ -0,0 +1,137 @@ +#include + +#include "types.hpp" +#include "benchmark_common.hpp" + +using namespace autocomplete; + +template +void benchmark(std::string const& index_filename, uint32_t k, + uint32_t max_num_queries, float keep, + essentials::json_lines& stats, bool verbose) { + Index index1, index2; + essentials::load(index1, index_filename.c_str()); + essentials::load(index2, index_filename.c_str()); + + std::vector queries; + uint32_t num_queries = + load_queries(queries, max_num_queries, keep, std::cin); + uint64_t strings_reported_by_prefix_search = 0; + uint64_t better_scored_strings_reported_by_conjunctive_search = 0; + + stats.add("num_queries", std::to_string(num_queries)); + + std::vector difference; + difference.reserve(k); + nop_probe probe; + + for (auto const& query : queries) { + auto it1 = index1.prefix_topk(query, k, probe); + auto it2 = index2.conjunctive_topk(query, k, probe); + strings_reported_by_prefix_search += it1.size(); + + uint64_t more = 0; + assert(it2.size() >= it1.size()); + + auto const& prefix_search_scores = it1.pool()->const_scores(); + auto const& conjunctive_search_scores = it2.pool()->const_scores(); + assert(std::is_sorted(prefix_search_scores.begin(), + prefix_search_scores.begin() + it1.size())); + assert(std::is_sorted(conjunctive_search_scores.begin(), + conjunctive_search_scores.begin() + it2.size())); + + if (verbose) { + std::cout << "query: '" << query << "'" << std::endl; + { + auto it = it1; + std::cout << "prefix_search results: " << it.size() + << std::endl; + for (uint64_t i = 0; i != it.size(); ++i, ++it) { + auto completion = *it; + std::cout << completion.score << ": " + << std::string(completion.string.begin, + completion.string.end) + << std::endl; + } + } + { + auto it = it2; + std::cout << "conjunctive_search results: " << it.size() + << std::endl; + for (uint64_t i = 0; i != it.size(); ++i, ++it) { + auto completion = *it; + std::cout << completion.score << ": " + << std::string(completion.string.begin, + completion.string.end) + << std::endl; + } + } + } + + difference.clear(); + auto it = std::set_difference( + conjunctive_search_scores.begin(), + conjunctive_search_scores.begin() + it2.size(), + prefix_search_scores.begin(), + prefix_search_scores.begin() + it1.size(), difference.begin()); + more = std::distance(difference.begin(), it); + if (verbose) std::cout << "more: " << more << std::endl; + better_scored_strings_reported_by_conjunctive_search += more; + } + + stats.add("strings_reported_by_prefix_search", + 
std::to_string(strings_reported_by_prefix_search)); + stats.add( + "better_scored_strings_reported_by_conjunctive_search", + std::to_string(better_scored_strings_reported_by_conjunctive_search)); + stats.add( + "better_scored_strings_reported_by_conjunctive_search_in_percentage", + std::to_string(better_scored_strings_reported_by_conjunctive_search * + 100.0 / strings_reported_by_prefix_search)); +} + +int main(int argc, char** argv) { + cmd_line_parser::parser parser(argc, argv); + parser.add("type", "Index type."); + parser.add("k", "top-k value."); + parser.add("index_filename", "Index filename."); + parser.add("num_terms_per_query", "Number of terms per query."); + parser.add("max_num_queries", "Maximum number of queries to execute."); + parser.add("percentage", + "A float in [0,1] specifying how much we keep of the last token " + "in a query: n x 100 <=> n%, for n in [0,1]."); + parser.add("verbose", "Verbose output.", "--verbose"); + if (!parser.parse()) return 1; + + auto type = parser.get("type"); + auto k = parser.get("k"); + auto index_filename = parser.get("index_filename"); + auto max_num_queries = parser.get("max_num_queries"); + auto keep = parser.get("percentage"); + auto verbose = parser.get("verbose"); + + essentials::json_lines stats; + stats.new_line(); + stats.add("num_terms_per_query", + parser.get("num_terms_per_query")); + stats.add("percentage", std::to_string(keep)); + + if (type == "ef_type1") { + benchmark(index_filename, k, max_num_queries, + keep, stats, verbose); + } else if (type == "ef_type2") { + benchmark(index_filename, k, max_num_queries, + keep, stats, verbose); + } else if (type == "ef_type3") { + benchmark(index_filename, k, max_num_queries, + keep, stats, verbose); + } else if (type == "ef_type4") { + benchmark(index_filename, k, max_num_queries, + keep, stats, verbose); + } else { + return 1; + } + + stats.print(); + return 0; +} \ No newline at end of file diff --git a/archive/example.sh b/archive/example.sh new file mode 100644 index 0000000..4ac00bf --- /dev/null +++ b/archive/example.sh @@ -0,0 +1,3 @@ +cd build +./build ef_type1 ../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions -o trec_05.ef_type1.bin +./web_server 8000 trec_05.ef_type1.bin \ No newline at end of file diff --git a/archive/external/CMakeLists.txt b/archive/external/CMakeLists.txt new file mode 100644 index 0000000..5d0ee92 --- /dev/null +++ b/archive/external/CMakeLists.txt @@ -0,0 +1,4 @@ +include_directories(essentials/include) + +set(DOCTEST_INCLUDE_DIR ${AUTOCOMPLETE_SOURCE_DIR}/external/doctest) +include_directories(${DOCTEST_INCLUDE_DIR}) \ No newline at end of file diff --git a/archive/include/autocomplete.hpp b/archive/include/autocomplete.hpp new file mode 100644 index 0000000..78e54ad --- /dev/null +++ b/archive/include/autocomplete.hpp @@ -0,0 +1,223 @@ +#pragma once + +#include "util_types.hpp" +#include "autocomplete_common.hpp" +#include "scored_string_pool.hpp" +#include "constants.hpp" + +namespace autocomplete { + +template +struct autocomplete { + typedef scored_string_pool::iterator iterator_type; + + autocomplete() { + m_pool.resize(constants::POOL_SIZE, constants::MAX_K); + } + + autocomplete(parameters const& params) + : autocomplete() { + typename Completions::builder cm_builder(params); + typename Dictionary::builder di_builder(params); + typename InvertedIndex::builder ii_builder(params); + typename ForwardIndex::builder fi_builder(params); + + m_unsorted_docs_list.build(cm_builder.doc_ids()); + 
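+        // RMQ-based structure over the minimal (i.e., smallest) docID of each
+        // posting list (the "first column" of the inverted index), used to
+        // answer single-term queries without probing every forward list.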
m_unsorted_minimal_docs_list.build(ii_builder.minimal_doc_ids()); + + cm_builder.build(m_completions); + di_builder.build(m_dictionary); + ii_builder.build(m_inverted_index); + fi_builder.build(m_forward_index); + } + + template + iterator_type prefix_topk(std::string const& query, const uint32_t k, + Probe& probe) { + assert(k <= constants::MAX_K); + + probe.start(0); + init(); + completion_type prefix; + byte_range suffix; + constexpr bool must_find_prefix = true; + if (!parse(m_dictionary, query, prefix, suffix, must_find_prefix)) { + return m_pool.begin(); + } + probe.stop(0); + + probe.start(1); + range suffix_lex_range = m_dictionary.locate_prefix(suffix); + if (suffix_lex_range.is_invalid()) return m_pool.begin(); + suffix_lex_range.begin += 1; + suffix_lex_range.end += 1; + range r = m_completions.locate_prefix(prefix, suffix_lex_range); + if (r.is_invalid()) return m_pool.begin(); + uint32_t num_completions = + m_unsorted_docs_list.topk(r, k, m_pool.scores()); + probe.stop(1); + + probe.start(2); + auto it = extract_strings(num_completions); + probe.stop(2); + + return it; + } + + template + iterator_type conjunctive_topk(std::string const& query, const uint32_t k, + Probe& probe) { + assert(k <= constants::MAX_K); + + probe.start(0); + init(); + completion_type prefix; + byte_range suffix; + constexpr bool must_find_prefix = false; + parse(m_dictionary, query, prefix, suffix, must_find_prefix); + probe.stop(0); + + probe.start(1); + range suffix_lex_range = m_dictionary.locate_prefix(suffix); + if (suffix_lex_range.is_invalid()) return m_pool.begin(); + uint32_t num_completions = 0; + if (prefix.size() == 0) { + suffix_lex_range.end += 1; + num_completions = m_unsorted_minimal_docs_list.topk( + m_inverted_index, suffix_lex_range, k, m_pool.scores()); + } else { + suffix_lex_range.begin += 1; + suffix_lex_range.end += 1; + num_completions = conjunctive_topk(prefix, suffix_lex_range, k); + } + probe.stop(1); + + probe.start(2); + auto it = extract_strings(num_completions); + probe.stop(2); + + return it; + } + + // iterator_type topk(std::string const& query, const uint32_t k) { + // assert(k <= constants::MAX_K); + // init(); + // completion_type prefix; + // byte_range suffix; + // uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); + // assert(num_terms > 0); + + // range suffix_lex_range = m_dictionary.locate_prefix(suffix); + // if (suffix_lex_range.is_invalid()) return m_pool.begin(); + + // suffix_lex_range.begin += 1; + // suffix_lex_range.end += 1; + // range r = m_completions.locate_prefix(prefix, suffix_lex_range); + + // uint32_t num_completions = 0; + // if (r.is_valid()) { + // num_completions = m_unsorted_docs_list.topk(r, k, + // m_pool.scores()); + // } + + // if (num_completions < k) { + // if (num_terms == 1) { // special case + // suffix_lex_range.begin -= 1; + // num_completions = m_unsorted_minimal_docs_list.topk( + // suffix_lex_range, k, m_pool.scores(), + // true // must return unique results + // ); + // } else { + // num_completions = conjunctive_topk(prefix, suffix_lex_range, + // k); + // } + // } + + // return extract_strings(num_completions); + // } + + size_t bytes() const { + return m_completions.bytes() + m_unsorted_docs_list.bytes() + + m_unsorted_minimal_docs_list.bytes() + m_dictionary.bytes() + + m_inverted_index.bytes() + m_forward_index.bytes(); + } + + void print_stats() const; + + template + void visit(Visitor& visitor) { + visitor.visit(m_completions); + visitor.visit(m_unsorted_docs_list); + 
visitor.visit(m_unsorted_minimal_docs_list); + visitor.visit(m_dictionary); + visitor.visit(m_inverted_index); + visitor.visit(m_forward_index); + } + +private: + Completions m_completions; + unsorted_list_type m_unsorted_docs_list; + typedef minimal_docids minimal_docids_type; + minimal_docids_type m_unsorted_minimal_docs_list; + Dictionary m_dictionary; + InvertedIndex m_inverted_index; + ForwardIndex m_forward_index; + + scored_string_pool m_pool; + + void init() { + m_pool.clear(); + m_pool.init(); + assert(m_pool.size() == 0); + } + + uint32_t conjunctive_topk(completion_type& prefix, const range suffix, + uint32_t const k) { + deduplicate(prefix); + if (prefix.size() == 1) { // we've got nothing to intersect + auto it = m_inverted_index.iterator(prefix.front() - 1); + return conjunctive_topk(it, suffix, k); + } + auto it = m_inverted_index.intersection_iterator(prefix); + return conjunctive_topk(it, suffix, k); + } + + template + uint32_t conjunctive_topk(Iterator& it, const range r, uint32_t const k) { + auto& topk_scores = m_pool.scores(); + uint32_t results = 0; + for (; it.has_next(); ++it) { + auto doc_id = *it; + if (m_forward_index.intersects(doc_id, r)) { + topk_scores[results++] = doc_id; + if (results == k) break; + } + } + return results; + } + + iterator_type extract_strings(const uint32_t num_completions) { + auto const& topk_scores = m_pool.scores(); + for (uint32_t i = 0; i != num_completions; ++i) { + auto doc_id = topk_scores[i]; + auto it = m_forward_index.iterator(doc_id); + uint64_t offset = m_pool.bytes(); + uint8_t* decoded = m_pool.data() + offset; + for (uint32_t j = 0; j != it.size(); ++j, ++it) { + auto term_id = *it; + uint8_t len = m_dictionary.extract(term_id, decoded); + decoded += len; + offset += len; + if (j != it.size() - 1) { + *decoded++ = ' '; + offset++; + } + } + m_pool.push_back_offset(offset); + } + assert(m_pool.size() == num_completions); + return m_pool.begin(); + } +}; +} // namespace autocomplete \ No newline at end of file diff --git a/archive/include/autocomplete2.hpp b/archive/include/autocomplete2.hpp new file mode 100644 index 0000000..eb3f994 --- /dev/null +++ b/archive/include/autocomplete2.hpp @@ -0,0 +1,256 @@ +#pragma once + +#include "util_types.hpp" +#include "building_util.hpp" +#include "compact_vector.hpp" +#include "autocomplete_common.hpp" +#include "scored_string_pool.hpp" +#include "constants.hpp" + +namespace autocomplete { + +template +struct autocomplete2 { + typedef scored_string_pool::iterator iterator_type; + + autocomplete2() { + m_pool.resize(constants::POOL_SIZE, constants::MAX_K); + m_topk_completion_set.resize(constants::MAX_K, + 2 * constants::MAX_NUM_TERMS_PER_QUERY); + } + + autocomplete2(parameters const& params) + : autocomplete2() { + typename Completions::builder cm_builder(params); + typename Dictionary::builder di_builder(params); + typename InvertedIndex::builder ii_builder(params); + auto const& docid_to_lexid = cm_builder.docid_to_lexid(); + m_docid_to_lexid.build(docid_to_lexid.begin(), docid_to_lexid.size(), + util::ceil_log2(params.num_completions + 1)); + m_unsorted_docs_list.build( + util::invert(docid_to_lexid, params.num_completions)); + m_unsorted_minimal_docs_list.build(ii_builder.minimal_doc_ids()); + cm_builder.build(m_completions); + di_builder.build(m_dictionary); + ii_builder.build(m_inverted_index); + } + + template + iterator_type prefix_topk(std::string const& query, const uint32_t k, + Probe& probe) { + assert(k <= constants::MAX_K); + + probe.start(0); + init(); + 
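+        // Parse the query: all terms but the last are mapped to termIDs in
+        // `prefix`; the last token, possibly incomplete, is kept as `suffix`.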
completion_type prefix; + byte_range suffix; + constexpr bool must_find_prefix = true; + if (!parse(m_dictionary, query, prefix, suffix, must_find_prefix)) { + return m_pool.begin(); + } + probe.stop(0); + + probe.start(1); + range suffix_lex_range = m_dictionary.locate_prefix(suffix); + if (suffix_lex_range.is_invalid()) return m_pool.begin(); + suffix_lex_range.begin += 1; + suffix_lex_range.end += 1; + range r = m_completions.locate_prefix(prefix, suffix_lex_range); + if (r.is_invalid()) return m_pool.begin(); + uint32_t num_completions = + m_unsorted_docs_list.topk(r, k, m_pool.scores()); + probe.stop(1); + + probe.start(2); + extract_completions(num_completions); + auto it = extract_strings(num_completions); + probe.stop(2); + + return it; + } + + template + iterator_type conjunctive_topk(std::string const& query, const uint32_t k, + Probe& probe) { + assert(k <= constants::MAX_K); + + probe.start(0); + init(); + completion_type prefix; + byte_range suffix; + constexpr bool must_find_prefix = false; + parse(m_dictionary, query, prefix, suffix, must_find_prefix); + probe.stop(0); + + probe.start(1); + range suffix_lex_range = m_dictionary.locate_prefix(suffix); + if (suffix_lex_range.is_invalid()) return m_pool.begin(); + uint32_t num_completions = 0; + if (prefix.size() == 0) { + suffix_lex_range.end += 1; + num_completions = m_unsorted_minimal_docs_list.topk( + m_inverted_index, suffix_lex_range, k, m_pool.scores()); + extract_completions(num_completions); + } else { + suffix_lex_range.begin += 1; + suffix_lex_range.end += 1; + num_completions = conjunctive_topk(prefix, suffix_lex_range, k); + } + probe.stop(1); + + probe.start(2); + auto it = extract_strings(num_completions); + probe.stop(2); + + return it; + } + + // iterator_type topk(std::string const& query, const uint32_t k) { + // assert(k <= constants::MAX_K); + // init(); + // completion_type prefix; + // byte_range suffix; + // uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); + // assert(num_terms > 0); + + // range suffix_lex_range = m_dictionary.locate_prefix(suffix); + // if (suffix_lex_range.is_invalid()) return m_pool.begin(); + + // suffix_lex_range.begin += 1; + // suffix_lex_range.end += 1; + // range r = m_completions.locate_prefix(prefix, suffix_lex_range); + + // uint32_t num_completions = 0; + // if (r.is_valid()) { + // num_completions = m_unsorted_docs_list.topk(r, k, + // m_pool.scores()); + // } + + // if (num_completions < k) { + // if (num_terms == 1) { // special case + // suffix_lex_range.begin -= 1; + // num_completions = m_unsorted_minimal_docs_list.topk( + // suffix_lex_range, k, m_pool.scores(), + // true // must return unique results + // ); + // extract_completions(num_completions); + // } else { + // num_completions = conjunctive_topk(prefix, suffix_lex_range, + // k); + // } + // } else { + // extract_completions(num_completions); + // } + + // return extract_strings(num_completions); + // } + + size_t bytes() const { + return m_completions.bytes() + m_unsorted_docs_list.bytes() + + m_unsorted_minimal_docs_list.bytes() + m_dictionary.bytes() + + m_docid_to_lexid.bytes() + m_inverted_index.bytes(); + } + + void print_stats() const; + + template + void visit(Visitor& visitor) { + visitor.visit(m_completions); + visitor.visit(m_unsorted_docs_list); + visitor.visit(m_unsorted_minimal_docs_list); + visitor.visit(m_dictionary); + visitor.visit(m_inverted_index); + visitor.visit(m_docid_to_lexid); + } + +private: + Completions m_completions; + unsorted_list_type m_unsorted_docs_list; + 
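+    // m_unsorted_docs_list stores the docIDs permuted by the lexicographic
+    // order of the corresponding completions, with RMQ support for top-k.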
typedef minimal_docids minimal_docids_type; + minimal_docids_type m_unsorted_minimal_docs_list; + Dictionary m_dictionary; + InvertedIndex m_inverted_index; + compact_vector m_docid_to_lexid; + + scored_string_pool m_pool; + completion_set m_topk_completion_set; + + void init() { + m_pool.clear(); + m_pool.init(); + assert(m_pool.size() == 0); + } + + void extract_completions(const uint32_t num_completions) { + auto const& topk_scores = m_pool.scores(); + auto& completions = m_topk_completion_set.completions(); + auto& sizes = m_topk_completion_set.sizes(); + for (uint32_t i = 0; i != num_completions; ++i) { + auto doc_id = topk_scores[i]; + auto lex_id = m_docid_to_lexid[doc_id]; + uint8_t size = m_completions.extract(lex_id, completions[i]); + sizes[i] = size; + } + } + + uint32_t conjunctive_topk(completion_type& prefix, const range suffix, + uint32_t const k) { + deduplicate(prefix); + if (prefix.size() == 1) { // we've got nothing to intersect + auto it = m_inverted_index.iterator(prefix.front() - 1); + return conjunctive_topk(it, suffix, k); + } + auto it = m_inverted_index.intersection_iterator(prefix); + return conjunctive_topk(it, suffix, k); + } + + template + uint32_t conjunctive_topk(Iterator& it, const range r, const uint32_t k) { + auto& topk_scores = m_pool.scores(); + auto& completions = m_topk_completion_set.completions(); + auto& sizes = m_topk_completion_set.sizes(); + uint32_t i = 0; + + for (; it.has_next(); ++it) { + auto doc_id = *it; + auto lex_id = m_docid_to_lexid[doc_id]; + uint32_t size = m_completions.extract(lex_id, completions[i]); + for (uint32_t j = 0; j != size; ++j) { + if (r.contains(completions[i][j])) { + topk_scores[i] = doc_id; + sizes[i] = size; + ++i; + if (i == k) return k; + break; + } + } + } + + return i; + } + + iterator_type extract_strings(const uint32_t num_completions) { + auto const& completions = m_topk_completion_set.completions(); + auto const& sizes = m_topk_completion_set.sizes(); + for (uint32_t i = 0; i != num_completions; ++i) { + auto const& c = completions[i]; + uint32_t size = sizes[i]; + uint64_t offset = m_pool.bytes(); + uint8_t* decoded = m_pool.data() + offset; + for (uint32_t j = 0; j != size; ++j) { + auto term_id = c[j]; + uint8_t len = m_dictionary.extract(term_id, decoded); + decoded += len; + offset += len; + if (j != size - 1) { + *decoded++ = ' '; + offset++; + } + } + m_pool.push_back_offset(offset); + } + assert(m_pool.size() == num_completions); + return m_pool.begin(); + } +}; +} // namespace autocomplete \ No newline at end of file diff --git a/archive/include/autocomplete3.hpp b/archive/include/autocomplete3.hpp new file mode 100644 index 0000000..6765ad6 --- /dev/null +++ b/archive/include/autocomplete3.hpp @@ -0,0 +1,264 @@ +#pragma once + +#include "util_types.hpp" +#include "building_util.hpp" +#include "compact_vector.hpp" +#include "autocomplete_common.hpp" +#include "scored_string_pool.hpp" +#include "constants.hpp" + +namespace autocomplete { + +/* +During the conjunctive step, maintain a min-heap of iterators, +one iterator for each termID in the lexicographic range of the +last token of the query. 
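+The heap is ordered by the iterators' current docIDs, so each candidate
+from the intersection can be checked with next_geq and the scan stops
+as soon as k hits are found.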
+*/ + +template +struct autocomplete3 { + typedef scored_string_pool::iterator iterator_type; + typedef min_heap> + min_priority_queue_type; + + autocomplete3() { + m_pool.resize(constants::POOL_SIZE, constants::MAX_K); + m_topk_completion_set.resize(constants::MAX_K, + 2 * constants::MAX_NUM_TERMS_PER_QUERY); + } + + autocomplete3(parameters const& params) + : autocomplete3() { + typename Completions::builder cm_builder(params); + typename Dictionary::builder di_builder(params); + typename InvertedIndex::builder ii_builder(params); + auto const& docid_to_lexid = cm_builder.docid_to_lexid(); + m_docid_to_lexid.build(docid_to_lexid.begin(), docid_to_lexid.size(), + util::ceil_log2(params.num_completions + 1)); + m_unsorted_docs_list.build( + util::invert(docid_to_lexid, params.num_completions)); + cm_builder.build(m_completions); + di_builder.build(m_dictionary); + ii_builder.build(m_inverted_index); + } + + template + iterator_type prefix_topk(std::string const& query, const uint32_t k, + Probe& probe) { + assert(k <= constants::MAX_K); + + probe.start(0); + init(); + completion_type prefix; + byte_range suffix; + constexpr bool must_find_prefix = true; + if (!parse(m_dictionary, query, prefix, suffix, must_find_prefix)) { + return m_pool.begin(); + } + probe.stop(0); + + probe.start(1); + range suffix_lex_range = m_dictionary.locate_prefix(suffix); + if (suffix_lex_range.is_invalid()) return m_pool.begin(); + suffix_lex_range.begin += 1; + suffix_lex_range.end += 1; + range r = m_completions.locate_prefix(prefix, suffix_lex_range); + if (r.is_invalid()) return m_pool.begin(); + uint32_t num_completions = + m_unsorted_docs_list.topk(r, k, m_pool.scores()); + probe.stop(1); + + probe.start(2); + extract_completions(num_completions); + auto it = extract_strings(num_completions); + probe.stop(2); + + return it; + } + + template + iterator_type conjunctive_topk(std::string const& query, const uint32_t k, + Probe& probe) { + assert(k <= constants::MAX_K); + + probe.start(0); + init(); + completion_type prefix; + byte_range suffix; + constexpr bool must_find_prefix = false; + parse(m_dictionary, query, prefix, suffix, must_find_prefix); + probe.stop(0); + + probe.start(1); + uint32_t num_completions = 0; + range suffix_lex_range = m_dictionary.locate_prefix(suffix); + if (suffix_lex_range.is_invalid()) return m_pool.begin(); + suffix_lex_range.begin += 1; + suffix_lex_range.end += 1; + num_completions = conjunctive_topk(prefix, suffix_lex_range, k); + probe.stop(1); + + probe.start(2); + extract_completions(num_completions); + auto it = extract_strings(num_completions); + probe.stop(2); + + return it; + } + + // iterator_type topk(std::string const& query, const uint32_t k) { + // assert(k <= constants::MAX_K); + // init(); + // completion_type prefix; + // byte_range suffix; + // uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); + // assert(num_terms > 0); + + // range suffix_lex_range = m_dictionary.locate_prefix(suffix); + // if (suffix_lex_range.is_invalid()) return m_pool.begin(); + + // suffix_lex_range.begin += 1; + // suffix_lex_range.end += 1; + // range r = m_completions.locate_prefix(prefix, suffix_lex_range); + + // uint32_t num_completions = 0; + // if (r.is_valid()) { + // num_completions = m_unsorted_docs_list.topk(r, k, + // m_pool.scores()); + // } + + // if (num_completions < k) { + // num_completions = + // conjunctive_topk(num_terms, prefix, suffix_lex_range, k); + // } + + // extract_completions(num_completions); + // return extract_strings(num_completions); + 
// } + + size_t bytes() const { + return m_completions.bytes() + m_unsorted_docs_list.bytes() + + m_dictionary.bytes() + m_docid_to_lexid.bytes() + + m_inverted_index.bytes(); + } + + void print_stats() const; + + template + void visit(Visitor& visitor) { + visitor.visit(m_completions); + visitor.visit(m_unsorted_docs_list); + visitor.visit(m_dictionary); + visitor.visit(m_inverted_index); + visitor.visit(m_docid_to_lexid); + } + +private: + Completions m_completions; + unsorted_list_type m_unsorted_docs_list; + Dictionary m_dictionary; + InvertedIndex m_inverted_index; + compact_vector m_docid_to_lexid; + + scored_string_pool m_pool; + completion_set m_topk_completion_set; + + void init() { + m_pool.clear(); + m_pool.init(); + assert(m_pool.size() == 0); + } + + void extract_completions(const uint32_t num_completions) { + auto const& topk_scores = m_pool.scores(); + auto& completions = m_topk_completion_set.completions(); + auto& sizes = m_topk_completion_set.sizes(); + for (uint32_t i = 0; i != num_completions; ++i) { + auto doc_id = topk_scores[i]; + auto lex_id = m_docid_to_lexid[doc_id]; + uint8_t size = m_completions.extract(lex_id, completions[i]); + sizes[i] = size; + } + } + + uint32_t conjunctive_topk(completion_type& prefix, + const range suffix_lex_range, const uint32_t k) { + if (prefix.size() == 0) { // we've got nothing to intersect + return heap_topk(m_inverted_index, suffix_lex_range, k, + m_pool.scores()); + } + deduplicate(prefix); + if (prefix.size() == 1) { // we've got nothing to intersect + auto it = m_inverted_index.iterator(prefix.front() - 1); + return conjunctive_topk(it, suffix_lex_range, k); + } + auto it = m_inverted_index.intersection_iterator(prefix); + return conjunctive_topk(it, suffix_lex_range, k); + } + + template + uint32_t conjunctive_topk(Iterator& it, const range r, const uint32_t k) { + assert(r.is_valid()); + + auto& topk_scores = m_pool.scores(); + min_priority_queue_type q; + q.reserve(r.end - r.begin + 1); // inclusive range + assert(r.begin > 0); + for (uint64_t term_id = r.begin; term_id <= r.end; ++term_id) { + q.push_back(m_inverted_index.iterator(term_id - 1)); + } + q.make_heap(); + + uint32_t results = 0; + for (; it.has_next() and !q.empty(); ++it) { + auto doc_id = *it; + while (!q.empty()) { + auto& z = q.top(); + auto val = *z; + if (val > doc_id) break; + if (val < doc_id) { + val = z.next_geq(doc_id); + if (!z.has_next()) { + q.pop(); + } else { + q.heapify(); + } + } + if (val == doc_id) { // NOTE: putting else here seems to slow + // down the code! 
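+                    // At this point the candidate produced by the
+                    // intersection also appears in at least one posting
+                    // list of the suffix range, so it is a valid
+                    // completion: record it and move to the next candidate.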
+ topk_scores[results++] = doc_id; + if (results == k) return results; + break; + } + } + } + + return results; + } + + iterator_type extract_strings(const uint32_t num_completions) { + auto const& completions = m_topk_completion_set.completions(); + auto const& sizes = m_topk_completion_set.sizes(); + for (uint32_t i = 0; i != num_completions; ++i) { + auto const& c = completions[i]; + uint32_t size = sizes[i]; + uint64_t offset = m_pool.bytes(); + uint8_t* decoded = m_pool.data() + offset; + for (uint32_t j = 0; j != size; ++j) { + auto term_id = c[j]; + uint8_t len = m_dictionary.extract(term_id, decoded); + decoded += len; + offset += len; + if (j != size - 1) { + *decoded++ = ' '; + offset++; + } + } + m_pool.push_back_offset(offset); + } + assert(m_pool.size() == num_completions); + return m_pool.begin(); + } +}; +} // namespace autocomplete \ No newline at end of file diff --git a/archive/include/autocomplete4.hpp b/archive/include/autocomplete4.hpp new file mode 100644 index 0000000..7d84bae --- /dev/null +++ b/archive/include/autocomplete4.hpp @@ -0,0 +1,290 @@ +#pragma once + +#include "util_types.hpp" +#include "building_util.hpp" +#include "compact_vector.hpp" +#include "autocomplete_common.hpp" +#include "scored_string_pool.hpp" +#include "constants.hpp" + +namespace autocomplete { + +/* Bast and Weber approach. */ + +template +struct autocomplete4 { + typedef scored_string_pool::iterator iterator_type; + + autocomplete4() { + m_pool.resize(constants::POOL_SIZE, constants::MAX_K); + m_topk_completion_set.resize(constants::MAX_K, + 2 * constants::MAX_NUM_TERMS_PER_QUERY); + } + + autocomplete4(parameters const& params, float c) + : autocomplete4() { + typename Completions::builder cm_builder(params); + typename Dictionary::builder di_builder(params); + typename BlockedInvertedIndex::builder ii_builder(params, c); + auto const& docid_to_lexid = cm_builder.docid_to_lexid(); + m_docid_to_lexid.build(docid_to_lexid.begin(), docid_to_lexid.size(), + util::ceil_log2(params.num_completions + 1)); + m_unsorted_docs_list.build( + util::invert(docid_to_lexid, params.num_completions)); + cm_builder.build(m_completions); + di_builder.build(m_dictionary); + ii_builder.build(m_inverted_index); + } + + template + iterator_type prefix_topk(std::string const& query, const uint32_t k, + Probe& probe) { + assert(k <= constants::MAX_K); + + probe.start(0); + init(); + completion_type prefix; + byte_range suffix; + constexpr bool must_find_prefix = true; + if (!parse(m_dictionary, query, prefix, suffix, must_find_prefix)) { + return m_pool.begin(); + } + probe.stop(0); + + probe.start(1); + range suffix_lex_range = m_dictionary.locate_prefix(suffix); + if (suffix_lex_range.is_invalid()) return m_pool.begin(); + suffix_lex_range.begin += 1; + suffix_lex_range.end += 1; + range r = m_completions.locate_prefix(prefix, suffix_lex_range); + if (r.is_invalid()) return m_pool.begin(); + uint32_t num_completions = + m_unsorted_docs_list.topk(r, k, m_pool.scores()); + probe.stop(1); + + probe.start(2); + extract_completions(num_completions); + auto it = extract_strings(num_completions); + probe.stop(2); + + return it; + } + + template + iterator_type conjunctive_topk(std::string const& query, const uint32_t k, + Probe& probe) { + assert(k <= constants::MAX_K); + + probe.start(0); + init(); + completion_type prefix; + byte_range suffix; + constexpr bool must_find_prefix = false; + parse(m_dictionary, query, prefix, suffix, must_find_prefix); + probe.stop(0); + + probe.start(1); + range suffix_lex_range = 
m_dictionary.locate_prefix(suffix); + if (suffix_lex_range.is_invalid()) return m_pool.begin(); + suffix_lex_range.begin += 1; + suffix_lex_range.end += 1; + uint32_t num_completions = + conjunctive_topk(prefix, suffix_lex_range, k); + probe.stop(1); + + probe.start(2); + extract_completions(num_completions); + auto it = extract_strings(num_completions); + probe.stop(2); + + return it; + } + + // iterator_type topk(std::string const& query, const uint32_t k) { + // assert(k <= constants::MAX_K); + // init(); + // completion_type prefix; + // byte_range suffix; + // parse(m_dictionary, query, prefix, suffix); + + // range suffix_lex_range = m_dictionary.locate_prefix(suffix); + // if (suffix_lex_range.is_invalid()) return m_pool.begin(); + + // suffix_lex_range.begin += 1; + // suffix_lex_range.end += 1; + // range r = m_completions.locate_prefix(prefix, suffix_lex_range); + + // uint32_t num_completions = 0; + // if (r.is_valid()) { + // num_completions = m_unsorted_docs_list.topk(r, k, + // m_pool.scores()); + // } + + // if (num_completions < k) { + // num_completions = conjunctive_topk(prefix, suffix_lex_range, k); + // } + + // extract_completions(num_completions); + // return extract_strings(num_completions); + // } + + size_t bytes() const { + return m_completions.bytes() + m_unsorted_docs_list.bytes() + + m_dictionary.bytes() + m_docid_to_lexid.bytes() + + m_inverted_index.bytes(); + } + + void print_stats() const; + + template + void visit(Visitor& visitor) { + visitor.visit(m_completions); + visitor.visit(m_unsorted_docs_list); + visitor.visit(m_dictionary); + visitor.visit(m_inverted_index); + visitor.visit(m_docid_to_lexid); + } + +private: + Completions m_completions; + unsorted_list_type m_unsorted_docs_list; + Dictionary m_dictionary; + BlockedInvertedIndex m_inverted_index; + compact_vector m_docid_to_lexid; + + scored_string_pool m_pool; + completion_set m_topk_completion_set; + + void init() { + m_pool.clear(); + m_pool.init(); + assert(m_pool.size() == 0); + } + + void extract_completions(const uint32_t num_completions) { + auto const& topk_scores = m_pool.scores(); + auto& completions = m_topk_completion_set.completions(); + auto& sizes = m_topk_completion_set.sizes(); + for (uint32_t i = 0; i != num_completions; ++i) { + auto doc_id = topk_scores[i]; + auto lex_id = m_docid_to_lexid[doc_id]; + uint8_t size = m_completions.extract(lex_id, completions[i]); + sizes[i] = size; + } + } + + typedef typename BlockedInvertedIndex::block_type block_t; + + struct block_type_comparator { + bool operator()(block_t& l, block_t& r) { + return l.docs_iterator.operator*() > r.docs_iterator.operator*(); + } + }; + + uint32_t conjunctive_topk(completion_type& prefix, const range suffix, + const uint32_t k) { + auto& topk_scores = m_pool.scores(); + + typedef min_heap + min_priority_queue_type; + min_priority_queue_type q; + uint32_t current_block_id = m_inverted_index.block_id(suffix.begin); + uint32_t current_block_boundary = + m_inverted_index.block_boundary(current_block_id); + for (uint32_t i = suffix.begin; i != suffix.end; ++i) { + assert(i > 0); + if (i > current_block_boundary) { + q.push_back(m_inverted_index.block(current_block_id)); + current_block_id += 1; + current_block_boundary = + m_inverted_index.block_boundary(current_block_id); + } + } + q.push_back(m_inverted_index.block(current_block_id)); + q.make_heap(); + + uint32_t results = 0; + + auto check = [&](block_t& block, id_type doc_id) { + uint64_t pos = block.docs_iterator.position(); + 
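+            // The docs and offsets sequences of a block are aligned:
+            // position pos of the current doc selects, via [begin, end),
+            // the slice of the terms sequence holding that doc's term ids
+            // (shifted by the block's lower_bound). The doc is reported as
+            // soon as one of its terms falls in the suffix's
+            // lexicographic range.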
assert(block.docs_iterator.access(pos) == doc_id); + uint64_t begin = block.offsets_iterator.access(pos); + uint64_t end = block.offsets_iterator.access(pos + 1); + assert(end > begin); + for (uint64_t i = begin; i != end; ++i) { + auto t = block.terms_iterator.access(i) + block.lower_bound; + if (t > suffix.end) break; + if (suffix.contains(t)) { + topk_scores[results++] = doc_id; + break; + } + } + }; + + if (prefix.size() == 0) { + while (!q.empty()) { + auto& z = q.top(); + auto doc_id = z.docs_iterator.operator*(); + check(z, doc_id); + if (results == k) return results; + z.docs_iterator.next(); + if (!z.docs_iterator.has_next()) q.pop(); + q.heapify(); + } + } else { + deduplicate(prefix); + auto it = m_inverted_index.intersection_iterator(prefix, suffix); + for (; it.has_next() and !q.empty(); ++it) { + auto doc_id = *it; + while (!q.empty()) { + auto& z = q.top(); + auto val = z.docs_iterator.operator*(); + if (val > doc_id) break; + if (val < doc_id) { + val = z.docs_iterator.next_geq(doc_id); + if (!z.docs_iterator.has_next()) { + q.pop(); + } else { + q.heapify(); + } + } else { + if (val == doc_id) { + check(z, doc_id); + if (results == k) return results; + } + break; + } + } + } + } + + return results; + } + + iterator_type extract_strings(const uint32_t num_completions) { + auto const& completions = m_topk_completion_set.completions(); + auto const& sizes = m_topk_completion_set.sizes(); + for (uint32_t i = 0; i != num_completions; ++i) { + auto const& c = completions[i]; + uint32_t size = sizes[i]; + uint64_t offset = m_pool.bytes(); + uint8_t* decoded = m_pool.data() + offset; + for (uint32_t j = 0; j != size; ++j) { + auto term_id = c[j]; + uint8_t len = m_dictionary.extract(term_id, decoded); + decoded += len; + offset += len; + if (j != size - 1) { + *decoded++ = ' '; + offset++; + } + } + m_pool.push_back_offset(offset); + } + assert(m_pool.size() == num_completions); + return m_pool.begin(); + } +}; + +} // namespace autocomplete \ No newline at end of file diff --git a/archive/include/autocomplete_common.hpp b/archive/include/autocomplete_common.hpp new file mode 100644 index 0000000..21d952b --- /dev/null +++ b/archive/include/autocomplete_common.hpp @@ -0,0 +1,72 @@ +#pragma once + +#include "util_types.hpp" +#include "min_heap.hpp" +#include "unsorted_list.hpp" +#include "minimal_docids.hpp" +#include "succinct_rmq/cartesian_tree.hpp" + +namespace autocomplete { + +typedef unsorted_list unsorted_list_type; + +template +bool parse(Dictionary const& dict, std::string const& query, + completion_type& prefix, byte_range& suffix, bool must_find_prefix) { + byte_range_iterator it(string_to_byte_range(query)); + while (true) { + suffix = it.next(); + if (!it.has_next()) break; + auto term_id = dict.locate(suffix); + if (term_id != global::invalid_term_id) { + prefix.push_back(term_id); + } else { + if (must_find_prefix) return false; + } + } + return true; +} + +void deduplicate(completion_type& c) { + std::sort(c.begin(), c.end()); + auto end = std::unique(c.begin(), c.end()); + c.resize(std::distance(c.begin(), end)); +} + +template +uint32_t heap_topk(InvertedIndex const& index, const range r, const uint32_t k, + std::vector& topk_scores) { + assert(r.is_valid()); + + typedef min_heap> + min_priority_queue_type; + + min_priority_queue_type q; + q.reserve(r.end - r.begin + 1); // inclusive range + assert(r.begin > 0); + for (uint64_t term_id = r.begin; term_id <= r.end; ++term_id) { + q.push_back(index.iterator(term_id - 1)); + } + q.make_heap(); + + uint32_t results = 
0;
+
+    while (!q.empty()) {
+        auto& z = q.top();
+        auto doc_id = *z;
+        bool already_present = std::binary_search(
+            topk_scores.begin(), topk_scores.begin() + results, doc_id);
+        if (!already_present) {
+            topk_scores[results++] = doc_id;
+            if (results == k) return results;
+        }
+        z.next();
+        if (!z.has_next()) q.pop();
+        q.heapify();
+    }
+
+    return results;
+}
+
+}  // namespace autocomplete
\ No newline at end of file
diff --git a/include/bit_vector.hpp b/archive/include/bit_vector.hpp
similarity index 98%
rename from include/bit_vector.hpp
rename to archive/include/bit_vector.hpp
index 676c112..4afb7dd 100644
--- a/include/bit_vector.hpp
+++ b/archive/include/bit_vector.hpp
@@ -242,12 +242,6 @@ struct bit_vector {
         build(in);
     }

-    bit_vector& operator=(bit_vector const& other) {
-        bit_vector tmp(other);
-        tmp.swap(*this);
-        return *this;
-    }
-
     void swap(bit_vector& other) {
         std::swap(other.m_size, m_size);
         other.m_bits.swap(m_bits);
@@ -412,6 +406,7 @@ struct bits_getter {
         , m_base(offset)
         , m_width(width)
         , m_mask(-(width == 64) | ((uint64_t(1) << width) - 1)) {
+        assert(width > 0);
         util::prefetch(m_data + m_base / 64);
     }

diff --git a/include/blocked_inverted_index.hpp b/archive/include/blocked_inverted_index.hpp
similarity index 73%
rename from include/blocked_inverted_index.hpp
rename to archive/include/blocked_inverted_index.hpp
index 79319fe..2f1af3a 100644
--- a/include/blocked_inverted_index.hpp
+++ b/archive/include/blocked_inverted_index.hpp
@@ -21,9 +21,12 @@ struct blocked_inverted_index {
         builder(parameters const& params, float c)
             : m_num_integers(0)
-            , m_num_docs(params.num_completions)
+            , m_num_docs(params.universe)
             , m_num_terms(params.num_terms) {
-            assert(c > 0.0);
+            if (!(c > 0.0 and c <= 1.0)) {
+                throw std::runtime_error("c must be in (0,1]");
+            }
+
             essentials::logger("building blocked_inverted_index with c = " +
                                std::to_string(c) + "...");
@@ -115,6 +118,7 @@ struct blocked_inverted_index {
             auto max = *std::max_element(term_list.begin(), term_list.end());
             uint64_t width = util::ceil_log2(max + 1);
+            if (width == 0) width = 1;
             // std::cout << "using " << width << " [bpi]" << std::endl;
             m_terms.append_bits(width, 6);
             for (auto t : term_list) m_terms.append_bits(t, width);
@@ -248,6 +252,11 @@ struct blocked_inverted_index {
         return id;
     }

+    uint32_t block_boundary(uint32_t block_id) const {
+        assert(block_id < m_blocks.size());
+        return m_blocks[block_id];
+    }
+
     struct block_type {
         docs_iterator_type docs_iterator;
         offsets_iterator_type offsets_iterator;
@@ -263,61 +272,45 @@ struct blocked_inverted_index {
             : m_i(0)
             , m_num_docs(ii->num_docs())
             , m_suffix(r) {
-            assert(!r.is_invalid());
-
-            if (!term_ids.empty()) {
-                m_iterators.reserve(term_ids.size());  // at most
-                std::sort(term_ids.begin(), term_ids.end());
-                uint32_t current_block_id = ii->block_id(term_ids.front());
-                uint32_t i = 0;
-                uint32_t prev_i = 0;
-                for (; i != term_ids.size(); ++i) {
-                    auto term_id = term_ids[i];
-                    assert(term_id > 0);
-                    uint32_t b = ii->block_id(term_id);
-                    if (b > current_block_id) {
-                        auto block = ii->block(current_block_id);
-                        block.term_ids.reserve(term_ids.size());  // at most
-                        for (; prev_i != i; ++prev_i) {
-                            block.term_ids.push_back(term_ids[prev_i]);
-                        }
-                        m_iterators.push_back(std::move(block));
+            assert(r.is_valid());
+            assert(!term_ids.empty());
+            assert(std::is_sorted(term_ids.begin(), term_ids.end()));
+            assert(std::unique(term_ids.begin(), term_ids.end()) ==
+                   term_ids.end());
+
+            m_blocks.reserve(term_ids.size());  // at most
+            uint32_t current_block_id =
ii->block_id(term_ids.front()); + uint32_t i = 0; + uint32_t prev_i = 0; + for (; i != term_ids.size(); ++i) { + auto term_id = term_ids[i]; + assert(term_id > 0); + uint32_t b = ii->block_id(term_id); + if (b > current_block_id) { + auto block = ii->block(current_block_id); + block.term_ids.reserve(term_ids.size()); // at most + for (; prev_i != i; ++prev_i) { + block.term_ids.push_back(term_ids[prev_i]); } - current_block_id = b; + m_blocks.push_back(std::move(block)); } - - auto block = ii->block(current_block_id); - block.term_ids.reserve(term_ids.size()); // at most - for (; prev_i != i; ++prev_i) { - block.term_ids.push_back(term_ids[prev_i]); - } - m_iterators.push_back(std::move(block)); - - assert(m_iterators.size() > 0); - std::sort(m_iterators.begin(), m_iterators.end(), - [](auto const& l, auto const& r) { - return l.docs_iterator.size() < - r.docs_iterator.size(); - }); - - m_candidate = m_iterators[0].docs_iterator.access(0); - } else { - m_candidate = 0; + current_block_id = b; } - { - uint32_t current_block_id = ii->block_id(r.begin); - uint32_t i = r.begin; - for (; i != r.end; ++i) { - assert(i > 0); - uint32_t b = ii->block_id(i); - if (b > current_block_id) { - m_range.push_back(ii->block(current_block_id)); - } - current_block_id = b; - } - m_range.push_back(ii->block(current_block_id)); + auto block = ii->block(current_block_id); + block.term_ids.reserve(term_ids.size()); // at most + for (; prev_i != i; ++prev_i) { + block.term_ids.push_back(term_ids[prev_i]); } + m_blocks.push_back(std::move(block)); + + std::sort(m_blocks.begin(), m_blocks.end(), + [](auto const& l, auto const& r) { + return l.docs_iterator.size() < + r.docs_iterator.size(); + }); + + m_candidate = m_blocks[0].docs_iterator.access(0); next(); } @@ -331,62 +324,37 @@ struct blocked_inverted_index { } void operator++() { - assert(m_i == m_iterators.size()); - if (!m_iterators.empty()) { - if (m_iterators.size() > 1) { - m_candidate = m_iterators[0].docs_iterator.next(); - } - } else { - m_candidate += 1; + assert(m_i == m_blocks.size()); + if (m_blocks.size() > 1) { + m_candidate = m_blocks[0].docs_iterator.next(); } m_i = 0; next(); } - bool intersects() { - for (auto& block : m_range) { - uint64_t val = block.docs_iterator.next_geq(m_candidate); - if (val == m_candidate) { - uint64_t pos = block.docs_iterator.position(); - assert(block.docs_iterator.access(pos) == m_candidate); - uint64_t begin = block.offsets_iterator.access(pos); - uint64_t end = block.offsets_iterator.access(pos + 1); - assert(end > begin); - uint32_t lower_bound = block.lower_bound; - for (uint64_t i = begin; i != end; ++i) { - auto t = block.terms_iterator.access(i) + lower_bound; - if (t > m_suffix.end) break; - if (m_suffix.contains(t)) return true; - } - } - } - return false; - } - private: id_type m_candidate; size_t m_i; uint64_t m_num_docs; - std::vector m_iterators; + std::vector m_blocks; std::vector m_range; range m_suffix; bool in() { // is candidate doc in intersection? 
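        // A candidate is accepted by block m_i only if the term list of the
        // current doc contains *all* the query term ids that fall in this
        // block; both lists are sorted, so one forward scan over [begin, end)
        // suffices.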
- uint64_t pos = m_iterators[m_i].docs_iterator.position(); - if (pos == m_iterators[m_i].docs_iterator.size()) return false; - uint64_t begin = m_iterators[m_i].offsets_iterator.access(pos); - uint64_t end = m_iterators[m_i].offsets_iterator.access(pos + 1); + auto& b = m_blocks[m_i]; + uint64_t pos = b.docs_iterator.position(); + if (pos == b.docs_iterator.size()) return false; + uint64_t begin = b.offsets_iterator.access(pos); + uint64_t end = b.offsets_iterator.access(pos + 1); assert(end > begin); - if (end - begin < m_iterators[m_i].term_ids.size()) return false; + if (end - begin < b.term_ids.size()) return false; uint64_t i = begin; - uint32_t lower_bound = m_iterators[m_i].lower_bound; - for (auto x : m_iterators[m_i].term_ids) { + for (auto x : b.term_ids) { bool found = false; for (; i != end; ++i) { - auto t = - m_iterators[m_i].terms_iterator.access(i) + lower_bound; + auto t = b.terms_iterator.access(i) + b.lower_bound; if (t == x) { found = true; break; @@ -399,18 +367,17 @@ struct blocked_inverted_index { } void next() { - if (m_iterators.empty()) return; - if (m_iterators.size() == 1) { - while (m_candidate < m_num_docs and m_i != m_iterators.size()) { + if (m_blocks.size() == 1) { + while (m_candidate < m_num_docs and m_i != m_blocks.size()) { assert(m_i == 0); - m_candidate = m_iterators[m_i].docs_iterator.next(); + m_candidate = m_blocks[m_i].docs_iterator.next(); if (in()) ++m_i; } } else { - while (m_candidate < m_num_docs and m_i != m_iterators.size()) { + while (m_candidate < m_num_docs and m_i != m_blocks.size()) { // NOTE: since we work with unions of posting lists, // next_geq by scan runs faster - auto val = m_iterators[m_i].docs_iterator.next_geq_by_scan( + auto val = m_blocks[m_i].docs_iterator.next_geq_by_scan( m_candidate); bool is_in = in(); if (val == m_candidate and is_in) { @@ -429,34 +396,6 @@ struct blocked_inverted_index { return intersection_iterator_type(this, term_ids, r); } - template - void visit(Visitor& visitor) { - visitor.visit(m_num_integers); - visitor.visit(m_num_docs); - visitor.visit(m_num_terms); - visitor.visit(m_blocks); - visitor.visit(m_pointers_to_lists); - visitor.visit(m_lists); - visitor.visit(m_pointers_to_offsets); - visitor.visit(m_offsets); - visitor.visit(m_pointers_to_terms); - visitor.visit(m_terms); - } - -private: - uint64_t m_num_integers; - uint64_t m_num_docs; - uint64_t m_num_terms; - - std::vector m_blocks; - - ef::ef_sequence m_pointers_to_lists; - bit_vector m_lists; - ef::ef_sequence m_pointers_to_offsets; - bit_vector m_offsets; - ef::ef_sequence m_pointers_to_terms; - bit_vector m_terms; - block_type block(uint32_t block_id) const { assert(block_id < num_blocks()); block_type b; @@ -485,6 +424,34 @@ struct blocked_inverted_index { return b; } + + template + void visit(Visitor& visitor) { + visitor.visit(m_num_integers); + visitor.visit(m_num_docs); + visitor.visit(m_num_terms); + visitor.visit(m_blocks); + visitor.visit(m_pointers_to_lists); + visitor.visit(m_lists); + visitor.visit(m_pointers_to_offsets); + visitor.visit(m_offsets); + visitor.visit(m_pointers_to_terms); + visitor.visit(m_terms); + } + +private: + uint64_t m_num_integers; + uint64_t m_num_docs; + uint64_t m_num_terms; + + std::vector m_blocks; + + ef::ef_sequence m_pointers_to_lists; + bit_vector m_lists; + ef::ef_sequence m_pointers_to_offsets; + bit_vector m_offsets; + ef::ef_sequence m_pointers_to_terms; + bit_vector m_terms; }; } // namespace autocomplete \ No newline at end of file diff --git a/archive/include/building_util.hpp 
b/archive/include/building_util.hpp new file mode 100644 index 0000000..0398879 --- /dev/null +++ b/archive/include/building_util.hpp @@ -0,0 +1,39 @@ +#pragma once + +#include "util.hpp" +#include "bit_vector.hpp" + +namespace autocomplete { +namespace util { + +std::vector invert(std::vector const& docid_to_lexid, + uint64_t size) { + std::vector lexid_to_docid(size); + for (uint64_t doc_id = 0; doc_id != docid_to_lexid.size(); ++doc_id) { + if (docid_to_lexid[doc_id] < size) { + lexid_to_docid[docid_to_lexid[doc_id]] = doc_id; + } + } + return lexid_to_docid; +} + +void push_pad(bit_vector_builder& bvb, uint64_t alignment = 8) { + uint64_t mod = bvb.size() % alignment; + if (mod) { + uint64_t pad = alignment - mod; + bvb.append_bits(0, pad); + assert(bvb.size() % alignment == 0); + } +} + +void eat_pad(bits_iterator& it, uint64_t alignment = 8) { + uint64_t mod = it.position() % alignment; + if (mod) { + uint64_t pad = alignment - mod; + it.get_bits(pad); + assert(it.position() % alignment == 0); + } +} + +} // namespace util +} // namespace autocomplete \ No newline at end of file diff --git a/include/compact_forward_index.hpp b/archive/include/compact_forward_index.hpp similarity index 92% rename from include/compact_forward_index.hpp rename to archive/include/compact_forward_index.hpp index 74ad769..50267f4 100644 --- a/include/compact_forward_index.hpp +++ b/archive/include/compact_forward_index.hpp @@ -14,25 +14,25 @@ struct compact_forward_index { : m_num_integers(0) , m_num_terms(params.num_terms) { essentials::logger("building forward_index..."); - uint64_t num_completions = params.num_completions; + uint64_t universe = params.universe; std::ifstream input( (params.collection_basename + ".forward").c_str(), std::ios_base::in); - - std::vector terms; - terms.reserve(params.num_completions * + std::vector terms; + terms.reserve(universe * constants::MAX_NUM_TERMS_PER_QUERY); // at most uint64_t size = 0; m_pointers.push_back(0); - for (uint64_t i = 0; i != num_completions; ++i) { + for (uint64_t i = 0; i != universe; ++i) { uint32_t n = 0; input >> n; - assert(n > 0 and n < constants::MAX_NUM_TERMS_PER_QUERY); + assert(n < constants::MAX_NUM_TERMS_PER_QUERY); m_num_integers += n; size += n; for (uint64_t k = 0; k != n; ++k) { id_type x; input >> x; + assert(x > 0); terms.push_back(x); } m_pointers.push_back(size); @@ -90,6 +90,7 @@ struct compact_forward_index { bool intersects(const range r) const { for (uint64_t i = 0; i != size(); ++i) { auto val = m_cv[m_base + i]; + assert(val > 0); if (r.contains(val)) return true; } return false; @@ -103,6 +104,7 @@ struct compact_forward_index { }; forward_list_iterator_type iterator(id_type doc_id) { + assert(doc_id < num_docs()); uint64_t pos = m_pointers.access(doc_id); uint64_t n = m_pointers.access(doc_id + 1) - pos; return {m_data, pos, n}; diff --git a/include/compact_vector.hpp b/archive/include/compact_vector.hpp similarity index 89% rename from include/compact_vector.hpp rename to archive/include/compact_vector.hpp index f0cd1bd..ac8e275 100644 --- a/include/compact_vector.hpp +++ b/archive/include/compact_vector.hpp @@ -73,26 +73,33 @@ struct compact_vector { }; struct builder { - builder(uint64_t n = 0, uint64_t w = 0) + builder() + : m_back(0) + , m_cur_block(0) + , m_cur_shift(0) {} + + builder(uint64_t n, uint64_t w) : m_size(n) - , m_width(!w ? 
w + 1 : w) + , m_width(w) , m_mask(-(w == 64) | ((1ULL << w) - 1)) , m_back(0) , m_cur_block(0) , m_cur_shift(0) , m_bits(essentials::words_for(m_size * m_width), 0) { - if (m_width > 64) { - throw std::runtime_error("width must be <= 64"); + if (m_width == 0 or m_width > 64) { + throw std::runtime_error("width must be > 0 and <= 64"); } } void resize(size_t n, uint64_t w) { m_size = n; - m_width = !w ? w + 1 : w; - if (m_width > 64) { - throw std::runtime_error("width must be <= 64"); + m_width = w; + if (m_width == 0 or m_width > 64) { + throw std::runtime_error("width must be > 0 and <= 64"); } m_mask = -(w == 64) | ((uint64_t(1) << w) - 1); + std::cout << "using " << essentials::words_for(m_size * m_width) + << " words" << std::endl; m_bits.resize(essentials::words_for(m_size * m_width), 0); } @@ -108,7 +115,7 @@ struct compact_vector { throw std::runtime_error("width must be greater than 0"); } - for (uint64_t i = 0; i < n; ++i, ++begin) { + for (uint64_t i = 0; i != n; ++i, ++begin) { push_back(*begin); } } @@ -220,8 +227,13 @@ struct compact_vector { void build(Iterator begin, uint64_t n) { uint64_t max = *std::max_element(begin, begin + n); uint64_t width = util::ceil_log2(max + 1); - std::cout << "\tusing " << width << " [bpi]" << std::endl; - compact_vector::builder builder(begin, n, width); + build(begin, n, width); + } + + template + void build(Iterator begin, uint64_t n, uint64_t w) { + std::cout << "\tusing " << w << " [bpi]" << std::endl; + compact_vector::builder builder(begin, n, w); builder.build(*this); } @@ -277,7 +289,7 @@ struct compact_vector { } uint64_t find(const range r, uint64_t id) { - assert(!r.is_invalid()); + assert(r.is_valid()); assert(r.end <= size()); return util::find(*this, id, r.begin, r.end - 1); } @@ -312,4 +324,5 @@ struct compact_vector { uint64_t m_mask; std::vector m_bits; }; + } // namespace autocomplete diff --git a/include/completion_trie.hpp b/archive/include/completion_trie.hpp similarity index 97% rename from include/completion_trie.hpp rename to archive/include/completion_trie.hpp index 8ae9036..2bc68ea 100644 --- a/include/completion_trie.hpp +++ b/archive/include/completion_trie.hpp @@ -166,16 +166,16 @@ struct completion_trie { completion_trie() {} // If the last token of the query is not completely specified, - // then we search for its lexicographic range among the children of c. + // then we search for its lexicographic range among the children of prefix. 
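+    // That is: we walk one trie level per term of prefix to reach its node,
+    // then restrict that node's child range to the ids in suffix_lex_range.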
// Return [a,b) - range locate_prefix(completion_type const& c, + range locate_prefix(completion_type const& prefix, range suffix_lex_range) const { - range r{global::not_found, global::not_found}; + range r = global::invalid_range; range pointer{0, m_nodes.front().size()}; uint32_t i = 0; - for (; i < c.size(); ++i) { - uint64_t pos = m_nodes[i].find(pointer, c[i]); + for (; i < prefix.size(); ++i) { + uint64_t pos = m_nodes[i].find(pointer, prefix[i]); if (pos == global::not_found) return global::invalid_range; pointer = m_pointers[i][pos]; } @@ -195,10 +195,11 @@ struct completion_trie { r.end += size; } - assert(r.end > r.begin); + assert(r.is_valid()); return r; } + // NOTE: not used bool is_member(completion_type const& c) const { assert(c.size() > 0); range pointer{0, m_nodes.front().size()}; diff --git a/include/constants.hpp b/archive/include/constants.hpp similarity index 100% rename from include/constants.hpp rename to archive/include/constants.hpp diff --git a/include/ef/compact_ef.hpp b/archive/include/ef/compact_ef.hpp similarity index 100% rename from include/ef/compact_ef.hpp rename to archive/include/ef/compact_ef.hpp diff --git a/include/ef/darray.hpp b/archive/include/ef/darray.hpp similarity index 100% rename from include/ef/darray.hpp rename to archive/include/ef/darray.hpp diff --git a/include/ef/ef_parameters.hpp b/archive/include/ef/ef_parameters.hpp similarity index 100% rename from include/ef/ef_parameters.hpp rename to archive/include/ef/ef_parameters.hpp diff --git a/include/ef/ef_sequence.hpp b/archive/include/ef/ef_sequence.hpp similarity index 93% rename from include/ef/ef_sequence.hpp rename to archive/include/ef/ef_sequence.hpp index 10970d6..2e9e293 100644 --- a/include/ef/ef_sequence.hpp +++ b/archive/include/ef/ef_sequence.hpp @@ -49,6 +49,7 @@ struct ef_sequence { ++within; } assert(values.size() == n); + assert(std::is_sorted(values.begin(), values.end())); compress(values.begin(), values.size(), values.back()); } @@ -142,33 +143,25 @@ struct ef_sequence { } uint64_t find(const range r, uint64_t id) const { - assert(!r.is_invalid()); + assert(r.is_valid()); assert(r.end <= size()); uint64_t prev_upper = previous_range_upperbound(r); return util::find(*this, id + prev_upper, r.begin, r.end - 1); } range find(const range r, const range lex) const { - assert(!r.is_invalid()); + assert(r.is_valid()); assert(r.end <= size()); auto prev_upper = previous_range_upperbound(r); - - uint64_t begin = - util::next_geq(*this, lex.begin + prev_upper, r.begin, r.end - 1); - if (begin == global::not_found) { + uint64_t id_begin = lex.begin + prev_upper; + uint64_t id_end = lex.end + prev_upper; + uint64_t begin = util::next_geq(*this, id_begin, r.begin, r.end - 1); + if (begin == global::not_found or access(begin) > id_end) { return {r.end, r.end}; } - - if (lex.begin == lex.end) { - return {begin, begin + 1}; - } - - uint64_t id_end = lex.end + prev_upper; + if (lex.begin == lex.end) return {begin, begin + 1}; uint64_t end = util::next_geq(*this, id_end, begin, r.end - 1); - if (end == global::not_found) { - return {begin, r.end}; - } - + if (end == global::not_found) return {begin, r.end}; return {begin, access(end) != id_end ? end : end + 1}; } @@ -251,7 +244,7 @@ struct ef_sequence { } uint64_t previous_range_upperbound(const range r) const { - assert(!r.is_invalid()); + assert(r.is_valid()); return r.begin ? 
access(r.begin - 1) : 0;
    }
};
diff --git a/include/fc_dictionary.hpp b/archive/include/fc_dictionary.hpp
similarity index 95%
rename from include/fc_dictionary.hpp
rename to archive/include/fc_dictionary.hpp
index 271f970..52e3971 100644
--- a/include/fc_dictionary.hpp
+++ b/archive/include/fc_dictionary.hpp
@@ -37,14 +37,17 @@ struct fc_dictionary {
         std::string curr;
         std::string header;
+        uint64_t total_characters = 0;
         for (uint32_t b = 0; b != buckets; ++b) {
             input >> header;
+            total_characters += header.size();
             write_header(header);
             m_pointers_to_headers.push_back(m_headers.size());
             prev.swap(header);
             uint32_t size = b != buckets - 1 ? BucketSize : tail;
             for (uint32_t i = 0; i != size; ++i) {
                 input >> curr;
+                total_characters += curr.size();
                 uint32_t l = 0;  // |lcp(curr,prev)|
                 while (l != curr.size() and l != prev.size() and
                        curr[l] == prev[l]) {
@@ -61,6 +64,9 @@ struct fc_dictionary {
             m_buckets.push_back(0);
         }

+        std::cout << static_cast<double>(total_characters) / m_size
+                  << " characters per string" << std::endl;
+
         input.close();
         essentials::logger("DONE");
     }
@@ -109,6 +115,7 @@ struct fc_dictionary {
     fc_dictionary() {}

     // NOTE: return inclusive ranges, i.e., [a,b]
+    // 0-based ids
     range locate_prefix(byte_range p) const {
         if (p.end - p.begin == 0) return {0, size() - 1};
         auto bucket_id = locate_buckets(p);
@@ -223,7 +230,7 @@ struct fc_dictionary {
         if (cmp < 0) {
             bucket_id = mi;
         } else {
-            bucket_id = mi - 1;
+            bucket_id = hi == -1 ? 0 : hi;
             h = header(bucket_id);
         }

@@ -307,10 +314,13 @@ struct fc_dictionary {
        // NOTE 1: excluding null terminators allows us to use memcpy here
        // because we know exactly how many bytes to copy: this is much faster
-        // than looping until we hit '\0'. NOTE 2: always copying a fixed amount
+        // than looping until we hit '\0'.
+
+        // NOTE 2: always copying a fixed amount
        // of bytes (constants::MAX_NUM_CHARS_PER_QUERY) is much faster than
        // copying an exact amount, e.g., suffix_len (although it could be
        // less), so do not do: memcpy(out + l, in, suffix_len).
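+        // Illustrative example (values assumed, not taken from the code):
+        // with lcp length l = 3, suffix_len = 5 and
+        // MAX_NUM_CHARS_PER_QUERY = 128, the fixed-size copy below writes
+        // 128 bytes starting at out + 3 in one straight-line memcpy, but
+        // only bytes [3, 8) are meaningful: the returned length
+        // l + suffix_len = 8 tells the caller to ignore the rest. This
+        // assumes both buffers have MAX_NUM_CHARS_PER_QUERY bytes of
+        // headroom past the copy start, so the over-copy is safe.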
+ memcpy(out + l, in, constants::MAX_NUM_CHARS_PER_QUERY); return l + suffix_len; @@ -340,8 +350,7 @@ struct fc_dictionary { if (cmp < 0) return global::invalid_term_id; curr += l - lcp_len + 2; } - assert(false); - __builtin_unreachable(); + return global::invalid_term_id; // term does not exist in dictionary } id_type left_locate(byte_range p, byte_range h, id_type bucket_id) const { diff --git a/include/integer_codes.hpp b/archive/include/integer_codes.hpp similarity index 100% rename from include/integer_codes.hpp rename to archive/include/integer_codes.hpp diff --git a/include/integer_fc_dictionary.hpp b/archive/include/integer_fc_dictionary.hpp similarity index 89% rename from include/integer_fc_dictionary.hpp rename to archive/include/integer_fc_dictionary.hpp index 218cacf..29d8743 100644 --- a/include/integer_fc_dictionary.hpp +++ b/archive/include/integer_fc_dictionary.hpp @@ -19,7 +19,7 @@ struct integer_fc_dictionary { essentials::logger( "building integer_fc_dictionary with bucket size " + std::to_string(BucketSize) + "..."); - m_doc_ids.reserve(params.num_completions); + m_docid_to_lexid.resize(params.universe, id_type(-1)); uint32_t buckets = std::ceil(double(m_size) / (BucketSize + 1)); m_pointers_to_buckets.reserve(buckets + 1); @@ -35,9 +35,10 @@ struct integer_fc_dictionary { std::ios_base::in); completion_iterator it(params, input); + id_type lex_id = 0; for (uint32_t b = 0; b != buckets; ++b) { auto& header = *it; - m_doc_ids.push_back(header.doc_id); + m_docid_to_lexid[header.doc_id] = lex_id++; write_header(header.completion); m_pointers_to_headers.push_back(m_headers.size()); completion_type prev; @@ -47,7 +48,7 @@ struct integer_fc_dictionary { for (uint32_t i = 0; i != size; ++i, ++it) { auto& record = *it; auto& curr = record.completion; - m_doc_ids.push_back(record.doc_id); + m_docid_to_lexid[record.doc_id] = lex_id++; uint32_t l = 0; // |lcp(curr,prev)| while (l != curr.size() and l != prev.size() and curr[l] == prev[l]) { @@ -76,7 +77,7 @@ struct integer_fc_dictionary { other.m_pointers_to_buckets.swap(m_pointers_to_buckets); other.m_headers.swap(m_headers); other.m_buckets.swap(m_buckets); - other.m_doc_ids.swap(m_doc_ids); + other.m_docid_to_lexid.swap(m_docid_to_lexid); } void build(integer_fc_dictionary& d) { @@ -88,8 +89,8 @@ struct integer_fc_dictionary { builder().swap(*this); } - std::vector& doc_ids() { - return m_doc_ids; + std::vector& docid_to_lexid() { + return m_docid_to_lexid; } private: @@ -98,7 +99,7 @@ struct integer_fc_dictionary { std::vector m_pointers_to_buckets; std::vector m_headers; std::vector m_buckets; - std::vector m_doc_ids; + std::vector m_docid_to_lexid; void write_header(completion_type const& c) { assert(c.size() > 0 and @@ -166,19 +167,20 @@ struct integer_fc_dictionary { prefix.push_back(global::invalid_term_id); } - locate_bucket(completion_to_uint32_range(prefix), h_end, bucket_id_end, - bucket_id_begin // hint + locate_right_bucket(completion_to_uint32_range(prefix), h_end, + bucket_id_end, + bucket_id_begin // hint ); uint32_t p_end = bucket_id_end * (BucketSize + 1); p_end += right_locate(completion_to_uint32_range(prefix), h_end, bucket_id_end); + prefix.pop_back(); + if (p_end < p_begin) { - prefix.pop_back(); return global::invalid_range; } - prefix.pop_back(); if (suffix_lex_range.begin == suffix_lex_range.end) { prefix.pop_back(); } @@ -269,13 +271,37 @@ struct integer_fc_dictionary { if (cmp < 0) { bucket_id = mi; } else { - bucket_id = mi - 1; + bucket_id = hi == -1 ? 
0 : hi; h = header(bucket_id); } return false; } + void locate_right_bucket(uint32_range t, uint32_range& h, + id_type& bucket_id, + int lower_bound_hint = 0) const { + int lo = lower_bound_hint, hi = buckets() - 1, mi = 0, cmp = 0; + size_t n = t.end - t.begin; + while (lo <= hi) { + mi = (lo + hi) / 2; + h = header(mi); + cmp = uint32_range_compare(h, t, n); + if (cmp > 0) { + hi = mi - 1; + } else if (cmp <= 0) { + lo = mi + 1; + } + } + + if (cmp < 0) { + bucket_id = mi; + } else { + bucket_id = hi == -1 ? 0 : hi; + h = header(bucket_id); + } + } + #define INT_FC_DICT_LOCATE_INIT \ static uint32_t decoded[2 * constants::MAX_NUM_TERMS_PER_QUERY]; \ memcpy(decoded, h.begin, (h.end - h.begin) * sizeof(uint32_t)); \ diff --git a/include/inverted_index.hpp b/archive/include/inverted_index.hpp similarity index 88% rename from include/inverted_index.hpp rename to archive/include/inverted_index.hpp index 7c84bd7..900fd96 100644 --- a/include/inverted_index.hpp +++ b/archive/include/inverted_index.hpp @@ -16,7 +16,7 @@ struct inverted_index { builder(parameters const& params) : m_num_integers(0) - , m_num_docs(params.num_completions) { + , m_num_docs(params.universe) { essentials::logger("building inverted_index..."); uint64_t num_terms = params.num_terms; @@ -28,10 +28,18 @@ struct inverted_index { std::vector list; m_pointers.push_back(0); + + uint32_t max_list_size = 0; + uint32_t min_list_size = uint32_t(-1); + for (uint64_t i = 0; i != num_terms; ++i) { list.clear(); uint32_t n = 0; input >> n; + + if (n > max_list_size) max_list_size = n; + if (n < min_list_size) min_list_size = n; + list.reserve(n); m_num_integers += n; for (uint64_t k = 0; k != n; ++k) { @@ -41,11 +49,17 @@ struct inverted_index { } m_minimal_doc_ids.push_back(list.front()); write_gamma_nonzero(m_bvb, n); - if (ListType::is_byte_aligned) util::push_pad(m_bvb); + if constexpr (ListType::is_byte_aligned) util::push_pad(m_bvb); ListType::build(m_bvb, list.begin(), m_num_docs, list.size()); m_pointers.push_back(m_bvb.size()); } + std::cout << "avg. 
list size = " + << static_cast(m_num_integers) / num_terms + << std::endl; + std::cout << "max_list_size = " << max_list_size << std::endl; + std::cout << "min_list_size = " << min_list_size << std::endl; + m_pointers.pop_back(); input.close(); essentials::logger("DONE"); @@ -86,7 +100,7 @@ struct inverted_index { uint64_t offset = m_pointers.access(term_id); bits_iterator it(m_data, offset); uint64_t n = read_gamma_nonzero(it); - if (ListType::is_byte_aligned) util::eat_pad(it); + if constexpr (ListType::is_byte_aligned) util::eat_pad(it); return {m_data, it.position(), m_num_docs, n}; } diff --git a/include/min_heap.hpp b/archive/include/min_heap.hpp similarity index 100% rename from include/min_heap.hpp rename to archive/include/min_heap.hpp diff --git a/archive/include/minimal_docids.hpp b/archive/include/minimal_docids.hpp new file mode 100644 index 0000000..a7cb8f8 --- /dev/null +++ b/archive/include/minimal_docids.hpp @@ -0,0 +1,131 @@ +#pragma once + +#include "compact_vector.hpp" +#include "util_types.hpp" + +namespace autocomplete { + +template +struct minimal_docids { + static const uint32_t SCAN_THRESHOLD = 64; + typedef scored_range_with_list_iterator< + typename InvertedIndex::iterator_type> + range_type; + typedef scored_range_with_list_iterator_comparator< + typename range_type::iterator_type> + comparator_range_type; + + minimal_docids() {} + + void build(std::vector const& list) { + essentials::logger("building minimal_docids..."); + m_rmq.build(list, std::less()); + m_list.build(list.begin(), list.size()); + essentials::logger("DONE"); + } + + uint32_t topk(InvertedIndex const& index, const range r, const uint32_t k, + std::vector& topk_scores) { + range_type sr; + sr.r = {r.begin, r.end - 1}; // rmq needs inclusive ranges + sr.min_pos = m_rmq.rmq(sr.r.begin, sr.r.end); + sr.min_val = m_list.access(sr.min_pos); + + m_q.clear(); + m_q.push(sr); + + uint32_t results = 0; + while (!m_q.empty()) { + auto& min = m_q.top(); + auto docid = min.minimum(); + bool alread_present = std::binary_search( + topk_scores.begin(), topk_scores.begin() + results, docid); + if (!alread_present) { + topk_scores[results++] = docid; + if (results == k) break; + } + + if (min.is_open()) { + min.iterator.next(); + if (!min.iterator.has_next()) { + m_q.pop(); + } + m_q.heapify(); + } else { + // save + auto min_range = min.r; + auto min_pos = min.min_pos; + + min.set_iterator(index); + min.iterator.next(); + if (!min.iterator.has_next()) { + m_q.pop(); + } + + m_q.heapify(); + + if (min_pos > 0 and min_pos - 1 >= min_range.begin) { + range_type left; + left.r = {min_range.begin, min_pos - 1}; + if (left.r.end - left.r.begin <= SCAN_THRESHOLD) { + left.min_pos = rmq(left.r.begin, left.r.end); + } else { + left.min_pos = m_rmq.rmq(left.r.begin, left.r.end); + } + left.min_val = m_list.access(left.min_pos); + m_q.push(left); + } + + if (min_pos < size() - 1 and min_range.end >= min_pos + 1) { + range_type right; + right.r = {min_pos + 1, min_range.end}; + if (right.r.end - right.r.begin <= SCAN_THRESHOLD) { + right.min_pos = rmq(right.r.begin, right.r.end); + } else { + right.min_pos = m_rmq.rmq(right.r.begin, right.r.end); + } + right.min_val = m_list.access(right.min_pos); + m_q.push(right); + } + } + } + + return results; + } + + size_t size() const { + return m_list.size(); + } + + size_t bytes() const { + return m_rmq.bytes() + m_list.bytes(); + } + + template + void visit(Visitor& visitor) { + visitor.visit(m_rmq); + visitor.visit(m_list); + } + +private: + typedef min_heap 
min_priority_queue_type; + min_priority_queue_type m_q; + + RMQ m_rmq; + compact_vector m_list; + + uint64_t rmq(uint64_t lo, uint64_t hi) { // inclusive endpoints + uint64_t pos = lo; + id_type min = id_type(-1); + for (uint64_t i = lo; i <= hi; ++i) { + id_type val = m_list.access(i); + if (val < min) { + min = val; + pos = i; + } + } + return pos; + } +}; + +} // namespace autocomplete \ No newline at end of file diff --git a/include/parameters.hpp b/archive/include/parameters.hpp similarity index 81% rename from include/parameters.hpp rename to archive/include/parameters.hpp index db44d71..d628d25 100644 --- a/include/parameters.hpp +++ b/archive/include/parameters.hpp @@ -24,10 +24,12 @@ struct parameters { input >> num_terms; input >> max_string_length; input >> num_completions; + input >> universe; input >> num_levels; assert(num_terms > 0); assert(max_string_length > 0); assert(num_completions > 0); + assert(universe >= num_completions); assert(num_levels > 0); if (max_string_length > constants::MAX_NUM_CHARS_PER_QUERY) { @@ -41,14 +43,18 @@ struct parameters { } nodes_per_level.resize(num_levels, 0); - for (uint32_t i = 0; i != num_levels; ++i) { - input >> nodes_per_level[i]; + uint32_t i = 0; + for (; i != num_levels and input; ++i) input >> nodes_per_level[i]; + if (i != num_levels) { + throw std::runtime_error( + "File with statistics may be truncated or malformed"); } } uint32_t num_terms; uint32_t max_string_length; uint32_t num_completions; + uint32_t universe; uint32_t num_levels; std::vector nodes_per_level; std::string collection_basename; diff --git a/archive/include/probe.hpp b/archive/include/probe.hpp new file mode 100644 index 0000000..955a939 --- /dev/null +++ b/archive/include/probe.hpp @@ -0,0 +1,36 @@ +#pragma once + +#include +#include "util_types.hpp" + +namespace autocomplete { + +struct nop_probe { + inline void start(uint64_t) {} + inline void stop(uint64_t) {} +}; + +struct timer_probe { + timer_probe(uint64_t n) + : m_timers(n) {} + + inline void start(uint64_t i) { + assert(i < m_timers.size()); + m_timers[i].start(); + } + + inline void stop(uint64_t i) { + assert(i < m_timers.size()); + m_timers[i].stop(); + } + + timer_type const& get(uint64_t i) { + assert(i < m_timers.size()); + return m_timers[i]; + } + +private: + std::vector m_timers; +}; + +} // namespace autocomplete diff --git a/include/scored_string_pool.hpp b/archive/include/scored_string_pool.hpp similarity index 87% rename from include/scored_string_pool.hpp rename to archive/include/scored_string_pool.hpp index f834453..3f03f06 100644 --- a/include/scored_string_pool.hpp +++ b/archive/include/scored_string_pool.hpp @@ -4,6 +4,11 @@ namespace autocomplete { +struct scored_byte_range { + byte_range string; + id_type score; +}; + struct scored_string_pool { void init() { push_back_offset(0); @@ -39,6 +44,10 @@ struct scored_string_pool { return m_scores; } + std::vector const& const_scores() const { + return m_scores; + } + scored_byte_range operator[](size_t i) const { assert(i < size()); scored_byte_range sbr; @@ -69,6 +78,10 @@ struct scored_string_pool { return m_pool->operator[](m_pos); } + scored_string_pool const* pool() const { + return m_pool; + } + private: scored_string_pool const* m_pool; size_t m_pos; diff --git a/include/statistics.hpp b/archive/include/statistics.hpp similarity index 81% rename from include/statistics.hpp rename to archive/include/statistics.hpp index a863814..42654ae 100644 --- a/include/statistics.hpp +++ b/archive/include/statistics.hpp @@ -10,7 +10,8 @@ 
namespace autocomplete { void print(std::string const& what, size_t bytes, size_t total_bytes, uint64_t num_completions) { - std::cout << " " << what << ": " << convert(bytes, essentials::MiB) + std::cout << " " << what << ": " + << essentials::convert(bytes, essentials::MiB) << " [MiB]: " << static_cast(bytes) / num_completions << " [bytes per completion] "; std::cout << "(" << (bytes * 100.0) / total_bytes << "%)" << std::endl; @@ -31,20 +32,21 @@ template ::print_stats() const { size_t total_bytes = bytes(); - std::cout << "using " << convert(total_bytes, essentials::MiB) << " [MiB]" - << std::endl; + std::cout << "using " << essentials::convert(total_bytes, essentials::MiB) + << " [MiB]" << std::endl; print_bps("nodes", nodes_bytes(), size()); print_bps("pointers", pointers_bytes(), size()); print_bps("left extremes", left_extremes_bytes(), size()); print_bps("sizes", sizes_bytes(), size()); } -template -void autocomplete +void autocomplete::print_stats() const { size_t total_bytes = bytes(); - std::cout << "using " << convert(total_bytes, essentials::MiB) << " [MiB]: " + std::cout << "using " << essentials::convert(total_bytes, essentials::MiB) + << " [MiB]: " << static_cast(total_bytes) / m_completions.size() << " [bytes per completion] " << std::endl; @@ -74,18 +76,22 @@ void autocomplete(m_forward_index.num_integers()) / + m_completions.size() + << std::endl; print_bpi("data", m_forward_index.data_bytes(), m_forward_index.num_integers()); print_bpi("pointers", m_forward_index.pointer_bytes(), m_forward_index.num_integers()); } -template -void autocomplete2::print_stats() const { +template +void autocomplete2::print_stats() + const { size_t total_bytes = bytes(); - std::cout << "using " << convert(total_bytes, essentials::MiB) << " [MiB]: " + std::cout << "using " << essentials::convert(total_bytes, essentials::MiB) + << " [MiB]: " << static_cast(total_bytes) / m_completions.size() << " [bytes per completion] " << std::endl; @@ -115,12 +121,12 @@ void autocomplete2 -void autocomplete3::print_stats() const { +template +void autocomplete3::print_stats() + const { size_t total_bytes = bytes(); - std::cout << "using " << convert(total_bytes, essentials::MiB) << " [MiB]: " + std::cout << "using " << essentials::convert(total_bytes, essentials::MiB) + << " [MiB]: " << static_cast(total_bytes) / m_completions.size() << " [bytes per completion] " << std::endl; @@ -140,12 +146,13 @@ void autocomplete3 -void autocomplete4::print_stats() const { +void autocomplete4::print_stats() + const { size_t total_bytes = bytes(); - std::cout << "using " << convert(total_bytes, essentials::MiB) << " [MiB]: " + std::cout << "using " << essentials::convert(total_bytes, essentials::MiB) + << " [MiB]: " << static_cast(total_bytes) / m_completions.size() << " [bytes per completion] " << std::endl; diff --git a/include/succinct_rmq/README.md b/archive/include/succinct_rmq/README.md similarity index 100% rename from include/succinct_rmq/README.md rename to archive/include/succinct_rmq/README.md diff --git a/include/succinct_rmq/bp_vector.hpp b/archive/include/succinct_rmq/bp_vector.hpp similarity index 100% rename from include/succinct_rmq/bp_vector.hpp rename to archive/include/succinct_rmq/bp_vector.hpp diff --git a/include/succinct_rmq/bp_vector_support.hpp b/archive/include/succinct_rmq/bp_vector_support.hpp similarity index 100% rename from include/succinct_rmq/bp_vector_support.hpp rename to archive/include/succinct_rmq/bp_vector_support.hpp diff --git a/include/succinct_rmq/cartesian_tree.hpp 
b/archive/include/succinct_rmq/cartesian_tree.hpp similarity index 100% rename from include/succinct_rmq/cartesian_tree.hpp rename to archive/include/succinct_rmq/cartesian_tree.hpp diff --git a/include/succinct_rmq/rs_bit_vector.hpp b/archive/include/succinct_rmq/rs_bit_vector.hpp similarity index 100% rename from include/succinct_rmq/rs_bit_vector.hpp rename to archive/include/succinct_rmq/rs_bit_vector.hpp diff --git a/archive/include/types.hpp b/archive/include/types.hpp new file mode 100644 index 0000000..659199d --- /dev/null +++ b/archive/include/types.hpp @@ -0,0 +1,47 @@ +#pragma once + +#include "completion_trie.hpp" +#include "fc_dictionary.hpp" +#include "integer_fc_dictionary.hpp" +#include "compact_forward_index.hpp" +#include "inverted_index.hpp" +#include "blocked_inverted_index.hpp" +#include "autocomplete.hpp" +#include "autocomplete2.hpp" +#include "autocomplete3.hpp" +#include "autocomplete4.hpp" +#include "compact_vector.hpp" +#include "ef/ef_sequence.hpp" +#include "ef/compact_ef.hpp" + +namespace autocomplete { + +typedef uint_vec uint32_vec; +typedef uint_vec uint64_vec; + +typedef completion_trie + ef_completion_trie; +typedef fc_dictionary<> fc_dictionary_type; +typedef integer_fc_dictionary<> integer_fc_dictionary_type; +typedef inverted_index ef_inverted_index; +typedef blocked_inverted_index ef_blocked_inverted_index; + +/* compressed indexes */ +typedef autocomplete + ef_autocomplete_type1; + +typedef autocomplete2 + ef_autocomplete_type2; + +typedef autocomplete3 + ef_autocomplete_type3; + +typedef autocomplete4 + ef_autocomplete_type4; + +} // namespace autocomplete \ No newline at end of file diff --git a/include/uint_vec.hpp b/archive/include/uint_vec.hpp similarity index 94% rename from include/uint_vec.hpp rename to archive/include/uint_vec.hpp index 86d60c4..adeaa8c 100644 --- a/include/uint_vec.hpp +++ b/archive/include/uint_vec.hpp @@ -74,14 +74,14 @@ struct uint_vec { } uint64_t find(const range r, UintType id) const { - assert(!r.is_invalid()); + assert(r.is_valid()); assert(r.end <= size()); - UintType prev_upper = previous_range_upperbound(r); + auto prev_upper = previous_range_upperbound(r); return util::find(*this, id + prev_upper, r.begin, r.end - 1); } range find(const range r, const range lex) const { - assert(!r.is_invalid()); + assert(r.is_valid()); assert(r.end <= size()); auto prev_upper = previous_range_upperbound(r); @@ -131,9 +131,9 @@ struct uint_vec { std::vector m_data; UintType previous_range_upperbound(const range r) const { - assert(!r.is_invalid()); + assert(r.is_valid()); return r.begin ? 
access(r.begin - 1) : 0;
    }
-};  // namespace autocomplete
+};
 }  // namespace autocomplete
\ No newline at end of file
diff --git a/include/uncompressed_list.hpp b/archive/include/uncompressed_list.hpp
similarity index 100%
rename from include/uncompressed_list.hpp
rename to archive/include/uncompressed_list.hpp
diff --git a/include/unsorted_list.hpp b/archive/include/unsorted_list.hpp
similarity index 78%
rename from include/unsorted_list.hpp
rename to archive/include/unsorted_list.hpp
index e7cfddd..bb06a86 100644
--- a/include/unsorted_list.hpp
+++ b/archive/include/unsorted_list.hpp
@@ -1,48 +1,10 @@
 #pragma once

 #include "compact_vector.hpp"
+#include "util_types.hpp"

 namespace autocomplete {

-struct scored_byte_range {
-    byte_range string;
-    id_type score;
-};
-
-typedef std::function<bool(scored_range const&, scored_range const&)>
-    scored_range_comparator_type;
-scored_range_comparator_type scored_range_comparator =
-    [](scored_range const& l, scored_range const& r) {
-        return l.min_val > r.min_val;
-    };
-
-struct topk_queue {
-    void push(scored_range sr) {
-        m_q.push_back(sr);
-        std::push_heap(m_q.begin(), m_q.end(), scored_range_comparator);
-    }
-
-    scored_range top() {
-        return m_q.front();
-    }
-
-    void pop() {
-        std::pop_heap(m_q.begin(), m_q.end(), scored_range_comparator);
-        m_q.pop_back();
-    }
-
-    void clear() {
-        m_q.clear();
-    }
-
-    bool empty() const {
-        return m_q.empty();
-    }
-
-private:
-    std::vector<scored_range> m_q;
-};
-
 template <typename RMQ>
 struct unsorted_list {
     static const uint32_t SCAN_THRESHOLD = 64;
@@ -132,6 +94,40 @@ struct unsorted_list {
     }

 private:
+    struct topk_queue {
+        void push(scored_range sr) {
+            m_q.push_back(sr);
+            std::push_heap(m_q.begin(), m_q.end(), m_comparator);
+        }
+
+        scored_range top() {
+            return m_q.front();
+        }
+
+        void pop() {
+            std::pop_heap(m_q.begin(), m_q.end(), m_comparator);
+            m_q.pop_back();
+        }
+
+        void clear() {
+            m_q.clear();
+        }
+
+        bool empty() const {
+            return m_q.empty();
+        }
+
+    private:
+        std::vector<scored_range> m_q;
+
+        typedef std::function<bool(scored_range const&, scored_range const&)>
+            scored_range_comparator_type;
+        scored_range_comparator_type m_comparator = [](scored_range const& l,
+                                                       scored_range const& r) {
+            return scored_range::greater(l, r);
+        };
+    };
+
     topk_queue m_q;
     RMQ m_rmq;
     compact_vector m_list;
diff --git a/include/util.hpp b/archive/include/util.hpp
similarity index 99%
rename from include/util.hpp
rename to archive/include/util.hpp
index bb20bdb..4f0b89e 100644
--- a/include/util.hpp
+++ b/archive/include/util.hpp
@@ -51,6 +51,7 @@ uint64_t find(S const& sequence, uint64_t id, uint64_t lo, uint64_t hi) {
         if (val == id) {
             return pos;
         } else if (val > id) {
+            if (pos == 0) return global::not_found;
             hi = pos - 1;
         } else {
             lo = pos + 1;
diff --git a/include/util_types.hpp b/archive/include/util_types.hpp
similarity index 81%
rename from include/util_types.hpp
rename to archive/include/util_types.hpp
index 7405378..0890002 100644
--- a/include/util_types.hpp
+++ b/archive/include/util_types.hpp
@@ -36,6 +36,7 @@ struct range {
     uint64_t begin;
     uint64_t end;
     bool is_invalid() const;
+    bool is_valid() const;
     bool contains(uint64_t val) const;
 };

@@ -48,6 +49,10 @@ bool range::is_invalid() const {
            end == global::invalid_range.end or begin > end;
 }

+bool range::is_valid() const {
+    return !is_invalid();
+}
+
 bool range::contains(uint64_t val) const {
     if (val >= begin and val <= end) return true;
     return false;
 }

@@ -57,6 +62,55 @@ struct scored_range {
     range r;
     uint32_t min_pos;
     id_type min_val;
+
+    static bool greater(scored_range const& l, scored_range const& r) {
+        return l.min_val > r.min_val;
+    }
+};
+
+template <typename Iterator>
+struct
scored_range_with_list_iterator { + typedef Iterator iterator_type; + + scored_range_with_list_iterator() + : min_pos(global::invalid_term_id) + , m_open(false) {} + + range r; + uint32_t min_pos; + id_type min_val; + Iterator iterator; + + bool is_open() const { + return m_open; + } + + template + void set_iterator(InvertedIndex const& index) { + assert(min_pos != global::invalid_term_id); + m_open = true; + iterator = index.iterator(min_pos); + } + + id_type minimum() const { + return is_open() ? *iterator : min_val; + } + + // static bool greater(scored_range_with_list_iterator const& l, + // scored_range_with_list_iterator const& r) { + // return l.minimum() > r.minimum(); + // } + +private: + bool m_open; +}; + +template +struct scored_range_with_list_iterator_comparator { + bool operator()(scored_range_with_list_iterator const& l, + scored_range_with_list_iterator const& r) { + return l.minimum() > r.minimum(); + } }; struct byte_range { @@ -237,25 +291,4 @@ struct timer { typedef timer timer_type; -struct iterator { - iterator(id_type begin, id_type end) - : m_begin(begin) - , m_end(end) {} - - bool has_next() const { - return m_begin < m_end; - } - - id_type operator*() const { - return m_begin; - } - - void operator++() { - ++m_begin; - } - -private: - id_type m_begin, m_end; -}; - } // namespace autocomplete diff --git a/archive/install.sh b/archive/install.sh new file mode 100644 index 0000000..7714147 --- /dev/null +++ b/archive/install.sh @@ -0,0 +1,11 @@ +git submodule init +git submodule update +mkdir -p build +cd build +cmake .. -DCMAKE_BUILD_TYPE=Release -DUSE_SANITIZERS=Off -DUSE_INTRINSICS=On -DUSE_PDEP=On +make +cd ../test_data +bash preprocess.sh trec_05_efficiency_queries/trec_05_efficiency_queries.completions 300 +cd ../build +make test +cd .. diff --git a/archive/script/benchmark_dictionaries.sh b/archive/script/benchmark_dictionaries.sh new file mode 100644 index 0000000..29c9a84 --- /dev/null +++ b/archive/script/benchmark_dictionaries.sh @@ -0,0 +1,7 @@ +cd ../test_data +bash preprocess.sh aol/aol.completions 100000 +cd ../build +python ../script/collect_locate_prefix_results_by_varying_percentage.py fc ../test_data/aol/aol.completions 100000 +python ../script/collect_locate_prefix_results_by_varying_percentage.py trie ../test_data/aol/aol.completions 100000 +./benchmark_fc_dictionary ../test_data/aol/aol.completions 100000 < ../test_data/aol/aol.completions.queries/queries.length=1 > ../test_data/aol/aol.completions.dictionary_benchmark.txt +cd ../script \ No newline at end of file diff --git a/archive/script/build_indexes.py b/archive/script/build_indexes.py new file mode 100644 index 0000000..e01e1db --- /dev/null +++ b/archive/script/build_indexes.py @@ -0,0 +1,6 @@ +import sys, os + +dataset_name = sys.argv[1] # e.g., aol +types = ["ef_type1", "ef_type2", "ef_type3", "ef_type4"] +for t in types: + os.system("./build " + t + " ../test_data/" + dataset_name + "/" + dataset_name + ".completions -o " + t + "." 
+ dataset_name + ".bin -c 0.0001") \ No newline at end of file diff --git a/archive/script/collect_effectiveness_results_by_varying_percentage.py b/archive/script/collect_effectiveness_results_by_varying_percentage.py new file mode 100644 index 0000000..2693e70 --- /dev/null +++ b/archive/script/collect_effectiveness_results_by_varying_percentage.py @@ -0,0 +1,17 @@ +import sys, os + +index_type = sys.argv[1] +index_filename = sys.argv[2] +collection_basename = sys.argv[3] # e.g., aol/aol.completions or aol/aol.completions.filtered +k = sys.argv[4] +num_queries = sys.argv[5] + +output_filename = collection_basename + "." + index_type +output_filename += ".effectiveness.json" +query_filename_prefix = collection_basename + ".queries/queries." + +percentages = ["0.0", "0.25", "0.50", "0.75"] +for perc in percentages: + for terms in range(1,7): + os.system("../build/effectiveness " + index_type + " " + k + " ../build/" + index_filename + " " + str(terms) + " " + str(num_queries) + " " + perc + " < " + query_filename_prefix + "length=" + str(terms) + " 2>> " + output_filename) + os.system("../build/effectiveness " + index_type + " " + k + " ../build/" + index_filename + " 7+ " + str(num_queries) + " " + perc + " < " + query_filename_prefix + "length=7+ 2>> " + output_filename) diff --git a/archive/script/collect_locate_prefix_results_by_varying_percentage.py b/archive/script/collect_locate_prefix_results_by_varying_percentage.py new file mode 100644 index 0000000..305fafa --- /dev/null +++ b/archive/script/collect_locate_prefix_results_by_varying_percentage.py @@ -0,0 +1,14 @@ +import sys, os + +type = sys.argv[1] # 'trie' or 'fc' +collection_basename = sys.argv[2] +num_queries = sys.argv[3] + +output_filename = collection_basename + "." + type + ".locate_prefix.json" +query_filename_prefix = collection_basename + ".queries/queries." + +percentages = ["0.0", "0.25", "0.50", "0.75"] +for perc in percentages: + for terms in range(1,8): + os.system("../build/benchmark_locate_prefix " + type + " " + collection_basename + " " + str(terms) + " " + str(num_queries) + " " + perc + " < " + query_filename_prefix + "length=" + str(terms) + " 2>> " + output_filename) + os.system("../build/benchmark_locate_prefix " + type + " " + collection_basename + " 8+ " + str(num_queries) + " " + perc + " < " + query_filename_prefix + "length=8+ 2>> " + output_filename) diff --git a/archive/script/collect_results_by_varying_percentage.py b/archive/script/collect_results_by_varying_percentage.py new file mode 100644 index 0000000..c639032 --- /dev/null +++ b/archive/script/collect_results_by_varying_percentage.py @@ -0,0 +1,18 @@ +import sys, os + +index_type = sys.argv[1] +query_mode = sys.argv[2] # topk, prefix_topk, conjunctive_topk +index_filename = sys.argv[3] +collection_basename = sys.argv[4] # e.g., aol/aol.completions or aol/aol.completions.filtered +k = sys.argv[5] +num_queries = sys.argv[6] + +output_filename = collection_basename + "." + index_type +output_filename += "." + query_mode + ".json" +query_filename_prefix = collection_basename + ".queries/queries." 
+ +percentages = ["0.0", "0.25", "0.50", "0.75"] +for perc in percentages: + for terms in range(1,7): + os.system("../build/benchmark_" + query_mode + " " + index_type + " " + k + " ../build/" + index_filename + " " + str(terms) + " " + str(num_queries) + " " + perc + " < " + query_filename_prefix + "length=" + str(terms) + " 2>> " + output_filename) + os.system("../build/benchmark_" + query_mode + " " + index_type + " " + k + " ../build/" + index_filename + " 7+ " + str(num_queries) + " " + perc + " < " + query_filename_prefix + "length=7+ 2>> " + output_filename) diff --git a/src/CMakeLists.txt b/archive/src/CMakeLists.txt similarity index 70% rename from src/CMakeLists.txt rename to archive/src/CMakeLists.txt index 7b000b1..1c5a82d 100644 --- a/src/CMakeLists.txt +++ b/archive/src/CMakeLists.txt @@ -2,3 +2,5 @@ add_executable(build build.cpp) add_executable(web_server web_server.cpp ../external/mongoose/mongoose.c) add_executable(output_ds2i_format output_ds2i_format.cpp) add_executable(statistics statistics.cpp) +# add_executable(check_topk check_topk.cpp) +add_executable(map_queries map_queries.cpp) \ No newline at end of file diff --git a/src/build.cpp b/archive/src/build.cpp similarity index 52% rename from src/build.cpp rename to archive/src/build.cpp index 732318f..ba73954 100644 --- a/src/build.cpp +++ b/archive/src/build.cpp @@ -2,57 +2,48 @@ #include "types.hpp" #include "statistics.hpp" +#include "../external/cmd_line_parser/include/parser.hpp" using namespace autocomplete; template <typename Index> -void build(parameters const& params, char const* output_filename) { +void build(parameters const& params, std::string const& output_filename) { Index index(params); index.print_stats(); - if (output_filename) { + if (output_filename != "") { essentials::logger("saving data structure to disk..."); - essentials::save(index, output_filename); + essentials::save(index, output_filename.c_str()); essentials::logger("DONE"); } } void build_type4(parameters const& params, const float c, - char const* output_filename) { + std::string const& output_filename) { ef_autocomplete_type4 index(params, c); index.print_stats(); - if (output_filename) { + if (output_filename != "") { essentials::logger("saving data structure to disk..."); - essentials::save(index, output_filename); + essentials::save(index, output_filename.c_str()); essentials::logger("DONE"); } } int main(int argc, char** argv) { - int mandatory = 2; - if (argc < mandatory + 1) { - std::cout << argv[0] - << " <type> <collection_basename> [-o output_filename] [-c c]" - << std::endl; - return 1; - } - - std::string type(argv[1]); + cmd_line_parser::parser parser(argc, argv); + parser.add("type", "Index type."); + parser.add("collection_basename", "Collection basename."); + parser.add("output_filename", "Output filename.", "-o", false); + parser.add( + "c", + "Value for Bast and Weber's technique: c must be a float in (0,1].", + "-c", false); + if (!parser.parse()) return 1; + + auto type = parser.get<std::string>("type"); parameters params; - params.collection_basename = argv[2]; + params.collection_basename = parser.get<std::string>("collection_basename"); params.load(); - - char const* output_filename = nullptr; - float c = 0.0; - - for (int i = mandatory; i != argc; ++i) { - if (std::string(argv[i]) == "-o") { - ++i; - output_filename = argv[i]; - } else if (std::string(argv[i]) == "-c") { - ++i; - c = std::stof(argv[i]); - } - } + auto output_filename = parser.get<std::string>("output_filename"); if (type == "ef_type1") { build<ef_autocomplete_type1>(params, output_filename); @@ -61,10 +52,7 @@ int main(int argc, char** argv) { } else
if (type == "ef_type3") { build<ef_autocomplete_type3>(params, output_filename); } else if (type == "ef_type4") { - if (c == 0.0) { - std::cerr << "c must be greater than 0.0" << std::endl; - return 1; - } + auto c = parser.get<float>("c"); build_type4(params, c, output_filename); } else { return 1; diff --git a/archive/src/check_topk.cpp b/archive/src/check_topk.cpp new file mode 100644 index 0000000..cb466a1 --- /dev/null +++ b/archive/src/check_topk.cpp @@ -0,0 +1,64 @@ +#include <iostream> + +#include "types.hpp" +#include "../benchmark/benchmark_common.hpp" + +using namespace autocomplete; + +template <typename Index> +void check_topk(char const* binary_filename1, char const* binary_filename2, + uint32_t k, uint32_t max_num_queries, float keep) { + Index index1; + ef_autocomplete_type1 index2; + essentials::load(index1, binary_filename1); + essentials::load(index2, binary_filename2); + std::vector<std::string> queries; + load_queries(queries, max_num_queries, keep, std::cin); + for (auto const& query : queries) { + size_t n1 = index1.topk(query, k).size(); + size_t n2 = index2.topk(query, k).size(); + if (n1 != n2) { + std::cout << query << std::endl; + } + } +} + +int main(int argc, char** argv) { + int mandatory = 6; + if (argc < mandatory + 1) { + std::cout << argv[0] + << " <type> <k> <binary_filename1> <binary_filename2>" + " <max_num_queries> <keep>" + " < queries" + << std::endl; + std::cout << " <keep> is a float in [0,1] and specifies how much " + "we keep of the last token in a query " + << std::endl; + return 1; + } + + std::string type(argv[1]); + uint32_t k = std::atoi(argv[2]); + char const* binary_filename1 = argv[3]; + char const* binary_filename2 = argv[4]; + uint32_t max_num_queries = std::atoi(argv[5]); + float keep = std::atof(argv[6]); + + if (type == "ef_type1") { + check_topk<ef_autocomplete_type1>(binary_filename1, binary_filename2, k, + max_num_queries, keep); + } else if (type == "ef_type2") { + check_topk<ef_autocomplete_type2>(binary_filename1, binary_filename2, k, + max_num_queries, keep); + } else if (type == "ef_type3") { + check_topk<ef_autocomplete_type3>(binary_filename1, binary_filename2, k, + max_num_queries, keep); + } else if (type == "ef_type4") { + check_topk<ef_autocomplete_type4>(binary_filename1, binary_filename2, k, + max_num_queries, keep); + } else { + return 1; + } + + return 0; +} \ No newline at end of file diff --git a/archive/src/map_queries.cpp b/archive/src/map_queries.cpp new file mode 100644 index 0000000..de43df1 --- /dev/null +++ b/archive/src/map_queries.cpp @@ -0,0 +1,53 @@ +#include <iostream> + +#include "types.hpp" + +using namespace autocomplete; + +template <typename Dictionary> +completion_type parse(Dictionary const& dict, std::string const& query) { + completion_type completion; + byte_range_iterator it(string_to_byte_range(query)); + while (it.has_next()) { + byte_range term = it.next(); + auto term_id = dict.locate(term); + assert(term_id > 0); + assert(term_id != global::invalid_term_id); + completion.push_back(term_id - 1); + } + return completion; +} + +int main(int argc, char** argv) { + int mandatory = 2 + 1; + if (argc < mandatory) { + std::cout << argv[0] << " <collection_basename> <num_queries> < queries" + << std::endl; + return 1; + } + + parameters params; + params.collection_basename = argv[1]; + params.load(); + + uint32_t num_queries = std::atoi(argv[2]); + + fc_dictionary_type dict; + { + fc_dictionary_type::builder builder(params); + builder.build(dict); + } + + std::string query; + for (uint32_t i = 0; i != num_queries; ++i) { + if (!std::getline(std::cin, query)) break; + auto completion = parse(dict, query); + std::cerr << completion.front(); + for (size_t i = 1; i != completion.size(); ++i) { + std::cerr << "\t" << completion[i]; + } + std::cerr << "\n"; + } + + return 0; +} \ No newline at end of file
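
The one-line addition to `find()` in the `util.hpp` hunk above guards against unsigned underflow: when `pos == 0`, `hi = pos - 1` wraps around to 2^64 - 1 and the binary search walks off the end of the sequence instead of reporting a miss. Below is a self-contained sketch of the repaired loop; the scaffolding (including the stand-in `not_found` constant and the `main` driver) is hypothetical, and only the guard line mirrors the actual patch:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Hypothetical stand-in for global::not_found in the real code base.
static const uint64_t not_found = uint64_t(-1);

// Binary search over a sorted sequence, shaped like util.hpp's find().
uint64_t find(std::vector<uint64_t> const& sequence, uint64_t id,
              uint64_t lo, uint64_t hi) {
    while (lo <= hi) {
        uint64_t pos = lo + (hi - lo) / 2;
        uint64_t val = sequence[pos];
        if (val == id) {
            return pos;
        } else if (val > id) {
            // Without this guard, pos == 0 makes hi wrap to 2^64 - 1.
            if (pos == 0) return not_found;
            hi = pos - 1;
        } else {
            lo = pos + 1;
        }
    }
    return not_found;
}

int main() {
    std::vector<uint64_t> v = {2, 4, 8};
    assert(find(v, 8, 0, v.size() - 1) == 2);
    assert(find(v, 1, 0, v.size() - 1) == not_found);  // triggers the guard
    return 0;
}
```
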
diff --git a/src/output_ds2i_format.cpp b/archive/src/output_ds2i_format.cpp similarity index 97% rename from src/output_ds2i_format.cpp rename to archive/src/output_ds2i_format.cpp index cc139c4..eb92509 100644 --- a/src/output_ds2i_format.cpp +++ b/archive/src/output_ds2i_format.cpp @@ -27,7 +27,7 @@ int main(int argc, char** argv) { { // write ds2i header uint32_t n = 1; - uint32_t universe = params.num_completions; + uint32_t universe = params.universe; docs.write(reinterpret_cast<char const*>(&n), sizeof(uint32_t)); docs.write(reinterpret_cast<char const*>(&universe), sizeof(uint32_t)); } diff --git a/src/statistics.cpp b/archive/src/statistics.cpp similarity index 58% rename from src/statistics.cpp rename to archive/src/statistics.cpp index 5b2148f..9dbf689 100644 --- a/src/statistics.cpp +++ b/archive/src/statistics.cpp @@ -2,25 +2,25 @@ #include "types.hpp" #include "statistics.hpp" +#include "../external/cmd_line_parser/include/parser.hpp" using namespace autocomplete; template <typename Index> -void print_stats(char const* index_filename) { +void print_stats(std::string const& index_filename) { Index index; - essentials::load(index, index_filename); + essentials::load(index, index_filename.c_str()); index.print_stats(); } int main(int argc, char** argv) { - int mandatory = 2; - if (argc < mandatory + 1) { - std::cout << argv[0] << " <type> <index_filename>" << std::endl; - return 1; - } + cmd_line_parser::parser parser(argc, argv); + parser.add("type", "Index type."); + parser.add("index_filename", "Index filename."); + if (!parser.parse()) return 1; - std::string type(argv[1]); - char const* index_filename = argv[2]; + auto type = parser.get<std::string>("type"); + auto index_filename = parser.get<std::string>("index_filename"); if (type == "ef_type1") { print_stats<ef_autocomplete_type1>(index_filename); diff --git a/src/web_server.cpp b/archive/src/web_server.cpp similarity index 92% rename from src/web_server.cpp rename to archive/src/web_server.cpp index 94a259b..db317fa 100644 --- a/src/web_server.cpp +++ b/archive/src/web_server.cpp @@ -5,6 +5,7 @@ #include "constants.hpp" #include "types.hpp" +#include "probe.hpp" #include "../external/mongoose/mongoose.h" @@ -26,7 +27,7 @@ std::string escape_json(std::string const& s) { using namespace autocomplete; -typedef ef_autocomplete_type3 topk_index_type; +typedef ef_autocomplete_type1 topk_index_type; static std::string s_http_port("8000"); static struct mg_serve_http_opts s_http_server_opts; @@ -53,9 +54,10 @@ static void ev_handler(struct mg_connection* nc, int ev, void* p) { } std::string data; - auto it = topk_index.topk(query, k); - // auto it = topk_index.prefix_topk(query, k); - // auto it = topk_index.conjunctive_topk(query, k); + nop_probe probe; + // auto it = topk_index.topk(query, k, probe); + // auto it = topk_index.prefix_topk(query, k, probe); + auto it = topk_index.conjunctive_topk(query, k, probe); if (it.empty()) { data = "{\"suggestions\":[\"value\":\"\",\"data\":\"\"]}\n"; } else { diff --git a/archive/test/test_autocomplete.cpp b/archive/test/test_autocomplete.cpp new file mode 100644 index 0000000..8fe49cc --- /dev/null +++ b/archive/test/test_autocomplete.cpp @@ -0,0 +1,83 @@ +#include "test_common.hpp" + +using namespace autocomplete; + +typedef ef_autocomplete_type1 index_type; + +TEST_CASE("test autocomplete topk functions") { + char const* output_filename = testing::tmp_filename.c_str(); + parameters params; + params.collection_basename = testing::test_filename.c_str(); + params.load(); + + { + index_type index(params); + essentials::save(index, output_filename); + } + + { + index_type index; + 
essentials::load(index, output_filename); + + { + essentials::logger("testing prefix_topk()..."); + uint32_t k = 7; + std::vector queries = { + "a", "10", "african", + "air", "commercial", "internet", + "paris", "somerset", "the", + "the new", "the perfect", "the starting line", + "yu gi oh", "for sale", "dave mat", + "florence", "florida be", "for s", + "for sa", "for sal", "for sale", + "ford a", "ford au", "ford m", + "ford mu", "for", "fo", + "f", "matt", "fl", + "florir", "fly", "the starting l", + "floridaaa"}; + + nop_probe probe; + for (auto& query : queries) { + auto it = index.prefix_topk(query, k, probe); + std::cout << "top-" << it.size() << " completions for '" + << query << "':\n"; + for (uint32_t i = 0; i != it.size(); ++i, ++it) { + auto completion = *it; + std::cout << "(" << completion.score << ", '"; + print(completion.string); + std::cout << "')" << std::endl; + } + } + + essentials::logger("DONE"); + } + + { + essentials::logger("testing conjunctive_topk()..."); + uint32_t k = 7; + std::vector queries = { + "dave mat", "florence", "florida be", "for s", + "for sa", "for sal", "for sale", "ford a", + "ford au", "ford m", "ford mu", "for", + "fo", "f", "matt", "fl", + "flor", "fly", "the starting l"}; + + nop_probe probe; + for (auto& query : queries) { + auto it = index.conjunctive_topk(query, k, probe); + std::cout << "top-" << it.size() << " completions for '" + << query << "':\n"; + for (uint32_t i = 0; i != it.size(); ++i, ++it) { + auto completion = *it; + std::cout << "(" << completion.score << ", '"; + print(completion.string); + std::cout << "')" << std::endl; + } + } + + essentials::logger("DONE"); + } + } + + std::remove(output_filename); +} diff --git a/archive/test/test_blocked_inverted_index.cpp b/archive/test/test_blocked_inverted_index.cpp new file mode 100644 index 0000000..a2ede74 --- /dev/null +++ b/archive/test/test_blocked_inverted_index.cpp @@ -0,0 +1,63 @@ +#include "test_common.hpp" + +using namespace autocomplete; + +typedef ef_blocked_inverted_index blocked_inverted_index_type; +typedef ef_inverted_index inverted_index_type; + +TEST_CASE("test blocked_inverted_index::intersection_iterator") { + parameters params; + params.collection_basename = testing::test_filename.c_str(); + params.load(); + + inverted_index_type ii; + + { + inverted_index_type::builder ii_builder(params); + ii_builder.build(ii); + REQUIRE(ii.num_docs() == params.universe); + REQUIRE(ii.num_terms() == params.num_terms); + } + + { + static const uint32_t num_queries = 10000; + static const uint32_t max_num_terms = 3; + auto queries = testing::gen_random_queries(num_queries, max_num_terms, + params.num_terms); + + static const std::vector C = {0.0125, 0.025, 0.05, 0.1}; + blocked_inverted_index_type blocked_ii; + uint64_t total; + + for (auto c : C) { + total = 0; + { + blocked_inverted_index_type::builder blocked_ii_builder(params, + c); + blocked_ii_builder.build(blocked_ii); + } + + REQUIRE(blocked_ii.num_docs() == params.universe); + REQUIRE(blocked_ii.num_terms() == params.num_terms); + + for (auto& q : queries) { + auto ii_it = ii.intersection_iterator(q); + auto blocked_ii_it = + blocked_ii.intersection_iterator(q, {0, 0}); + + uint32_t n = 0; + for (; ii_it.has_next(); ++n, ++ii_it, ++blocked_ii_it) { + auto got = *blocked_ii_it; + auto expected = *ii_it; + REQUIRE_MESSAGE(got == expected, "expected doc_id " + << expected + << " but got " << got); + } + if (n) total += n; + REQUIRE(blocked_ii_it.has_next() == false); + } + + std::cout << total << std::endl; + } + } +} 
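
The test files above, and the ones that follow, all share the same round-trip discipline: build the structure from `parameters`, persist it with `essentials::save`, reload it with `essentials::load`, re-check its invariants, and remove the temporary file. A minimal doctest sketch of that pattern is below; the `toy_index` type is a hypothetical stand-in for the real index types, and the two include paths are assumptions that mirror the submodule layout used by `test_common.hpp`, not code from the repository:

```cpp
#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
#include "../external/doctest/doctest/doctest.h"
#include "../external/essentials/include/essentials.hpp"

#include <cstdio>
#include <cstdint>
#include <vector>

// Hypothetical structure standing in for the real indexes; visit() is
// the hook the essentials library uses for (de)serialization.
struct toy_index {
    std::vector<uint64_t> data;
    uint64_t size() const { return data.size(); }
    template <typename Visitor>
    void visit(Visitor& visitor) {
        visitor.visit(data);
    }
};

TEST_CASE("save/load round-trip") {
    char const* tmp = "tmp.bin";
    {
        toy_index index;
        index.data = {1, 2, 3};
        essentials::save(index, tmp);  // build, then persist to disk
    }
    {
        toy_index index;
        essentials::load(index, tmp);  // reload from disk
        REQUIRE(index.size() == 3);    // invariants must survive the trip
    }
    std::remove(tmp);                  // clean up, as every test here does
}
```
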
diff --git a/archive/test/test_common.hpp b/archive/test/test_common.hpp new file mode 100644 index 0000000..c17283f --- /dev/null +++ b/archive/test/test_common.hpp @@ -0,0 +1,88 @@ +#pragma once + +#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN +#include "../external/doctest/doctest/doctest.h" + +#include + +#include "types.hpp" +#include "probe.hpp" +#include "../benchmark/benchmark_common.hpp" + +namespace autocomplete { +namespace testing { + +static std::string test_filename( + "../test_data/trec_05_efficiency_queries/" + "trec_05_efficiency_queries.completions"); + +static std::string tmp_filename("tmp.bin"); + +id_type locate(std::vector const& terms, std::string const& t) { + return std::distance(terms.begin(), + std::lower_bound(terms.begin(), terms.end(), t)) + + 1; +} + +range locate_prefix(std::vector const& strings, + std::string const& p) { + auto comp_l = [](std::string const& l, std::string const& r) { + if (l.size() < r.size()) { + return strncmp(l.c_str(), r.c_str(), l.size()) <= 0; + } + return strcmp(l.c_str(), r.c_str()) < 0; + }; + + auto comp_r = [](std::string const& l, std::string const& r) { + if (l.size() < r.size()) { + return strncmp(l.c_str(), r.c_str(), l.size()) < 0; + } + return strcmp(l.c_str(), r.c_str()) < 0; + }; + + range r; + r.begin = std::distance( + strings.begin(), + std::lower_bound(strings.begin(), strings.end(), p, comp_l)); + r.end = std::distance( + strings.begin(), + std::upper_bound(strings.begin(), strings.end(), p, comp_r)); + + return r; +} + +typedef std::vector term_ids; + +std::vector gen_random_queries(uint32_t num_queries, + uint32_t max_num_terms, + uint32_t max_range_len) { + assert(max_num_terms > 1); + std::vector queries; + queries.reserve(num_queries); + essentials::uniform_int_rng random_num_terms(2, max_num_terms); + essentials::uniform_int_rng random_term_id(1, max_range_len); + + for (uint32_t i = 0; i != num_queries; ++i) { + term_ids q; + uint32_t num_terms = random_num_terms.gen(); + q.reserve(num_terms); + uint32_t num_distinct_terms = 0; + while (true) { + q.clear(); + for (uint32_t i = 0; i != num_terms; ++i) { + q.push_back(random_term_id.gen()); + } + std::sort(q.begin(), q.end()); + auto end = std::unique(q.begin(), q.end()); + num_distinct_terms = std::distance(q.begin(), end); + if (num_distinct_terms >= 2) break; + } + q.resize(num_distinct_terms); + queries.push_back(q); + } + + return queries; +} + +} // namespace testing +} // namespace autocomplete \ No newline at end of file diff --git a/archive/test/test_compact_forward_index.cpp b/archive/test/test_compact_forward_index.cpp new file mode 100644 index 0000000..dc78c07 --- /dev/null +++ b/archive/test/test_compact_forward_index.cpp @@ -0,0 +1,47 @@ +#include "test_common.hpp" + +using namespace autocomplete; + +TEST_CASE("test compact_forward_index::iterator") { + char const* output_filename = testing::tmp_filename.c_str(); + parameters params; + params.collection_basename = testing::test_filename.c_str(); + params.load(); + + { + compact_forward_index::builder builder(params); + compact_forward_index index; + builder.build(index); + REQUIRE(index.num_docs() == params.universe); + REQUIRE(index.num_terms() == params.num_terms); + essentials::save(index, output_filename); + } + + { + compact_forward_index index; + essentials::load(index, output_filename); + REQUIRE(index.num_docs() == params.universe); + REQUIRE(index.num_terms() == params.num_terms); + + std::ifstream input((params.collection_basename + ".forward").c_str(), + std::ios_base::in); + for 
(uint64_t i = 0; i != index.num_terms(); ++i) { + auto it = index.iterator(i); + uint32_t n = 0; + input >> n; + REQUIRE_MESSAGE(n == it.size(), "list has size " << it.size() + << " instead of " + << n); + for (uint64_t k = 0; k != n; ++k, ++it) { + id_type expected; + input >> expected; + auto got = *it; + REQUIRE_MESSAGE(got == expected, + "got " << got << " but expected " << expected); + } + } + input.close(); + + std::remove(output_filename); + } +}; diff --git a/archive/test/test_completion_trie.cpp b/archive/test/test_completion_trie.cpp new file mode 100644 index 0000000..c5155e1 --- /dev/null +++ b/archive/test/test_completion_trie.cpp @@ -0,0 +1,37 @@ +#include "test_common.hpp" + +using namespace autocomplete; + +typedef ef_completion_trie completion_trie_type; + +TEST_CASE("test completion_trie::is_member()") { + char const* output_filename = testing::tmp_filename.c_str(); + parameters params; + params.collection_basename = testing::test_filename.c_str(); + params.load(); + + { + completion_trie_type::builder builder(params); + completion_trie_type ct; + builder.build(ct); + REQUIRE(ct.size() == params.num_completions); + essentials::save(ct, output_filename); + } + + { + completion_trie_type ct; + essentials::load(ct, output_filename); + REQUIRE(ct.size() == params.num_completions); + std::ifstream input(params.collection_basename + ".mapped", + std::ios_base::in); + INFO("testing is_member()"); + completion_iterator it(params, input); + while (input) { + auto& record = *it; + REQUIRE(ct.is_member(record.completion)); + ++it; + } + input.close(); + std::remove(output_filename); + } +} diff --git a/archive/test/test_fc_dictionary.cpp b/archive/test/test_fc_dictionary.cpp new file mode 100644 index 0000000..50d12b0 --- /dev/null +++ b/archive/test/test_fc_dictionary.cpp @@ -0,0 +1,86 @@ +#include "test_common.hpp" + +using namespace autocomplete; + +TEST_CASE("test fc_dictionary") { + char const* output_filename = testing::tmp_filename.c_str(); + parameters params; + params.collection_basename = testing::test_filename.c_str(); + params.load(); + + { + fc_dictionary_type::builder builder(params); + fc_dictionary_type dict; + builder.build(dict); + essentials::save(dict, output_filename); + } + + { + fc_dictionary_type dict; + essentials::load(dict, output_filename); + + // test locate() and extract for all strings + std::vector terms; + terms.reserve(params.num_terms); + std::ifstream input((params.collection_basename + ".dict").c_str(), + std::ios_base::in); + if (!input.good()) { + throw std::runtime_error("File not found"); + } + std::string term; + term.reserve(256 + 1); + input >> term; + while (input) { + terms.push_back(std::move(term)); + input >> term; + } + input.close(); + + std::vector decoded(2 * constants::MAX_NUM_CHARS_PER_QUERY); + + for (auto const& t : terms) { + id_type expected = testing::locate(terms, t); + id_type got = dict.locate(string_to_byte_range(t)); + + REQUIRE_MESSAGE(got == expected, "expected id " << expected + << ", but got id " + << got); + + uint8_t string_len = dict.extract(got, decoded.data()); + REQUIRE_MESSAGE(string_len == t.size(), + "expected size " << t.size() << ", but got size " + << string_len); + + auto s = reinterpret_cast(decoded.data()); + for (uint8_t i = 0; i != string_len; ++i) { + REQUIRE_MESSAGE(t[i] == s[i], "expected char " << t[i] + << " but got " + << s[i]); + } + } + + // test locate_prefix() for all strings + std::string prefix; + prefix.reserve(256 + 1); + for (auto const& t : terms) { + uint32_t n = t.size(); + for 
(uint32_t prefix_len = 1; prefix_len <= n; ++prefix_len) { + prefix.clear(); + for (uint32_t i = 0; i != prefix_len; ++i) { + prefix.push_back(t[i]); + } + + range expected = testing::locate_prefix(terms, prefix); + range got = dict.locate_prefix(string_to_byte_range(prefix)); + REQUIRE_MESSAGE((got.begin == expected.begin and + got.end == expected.end - 1), + "Error for prefix '" + << prefix << "' : expected [" + << expected.begin << "," << expected.end - 1 + << "] but got [" << got.begin << "," + << got.end << "]"); + } + } + std::remove(output_filename); + } +} diff --git a/archive/test/test_integer_fc_dictionary.cpp b/archive/test/test_integer_fc_dictionary.cpp new file mode 100644 index 0000000..d36db82 --- /dev/null +++ b/archive/test/test_integer_fc_dictionary.cpp @@ -0,0 +1,63 @@ +#include "test_common.hpp" + +using namespace autocomplete; + +TEST_CASE("test integer_fc_dictionary") { + char const* output_filename = testing::tmp_filename.c_str(); + parameters params; + params.collection_basename = testing::test_filename.c_str(); + params.load(); + + { + integer_fc_dictionary_type::builder builder(params); + integer_fc_dictionary_type dict; + builder.build(dict); + essentials::save(dict, output_filename); + } + + { + integer_fc_dictionary_type dict; + essentials::load(dict, output_filename); + + { + std::ifstream input( + (params.collection_basename + ".mapped").c_str(), + std::ios_base::in); + completion_iterator it(params, input); + + completion_type decoded(2 * constants::MAX_NUM_TERMS_PER_QUERY); + for (id_type id = 0; id != params.num_completions; ++id, ++it) { + auto const& expected = (*it).completion; + REQUIRE(expected.size() > 0); + uint8_t size = dict.extract(id, decoded); + + REQUIRE_MESSAGE(expected.size() - 1 == size, + "Error in decoding the " + << id << "-th string: expected size " + << expected.size() - 1 << "," + << " but got size " << int(size)); + + for (uint8_t i = 0; i != size; ++i) { + REQUIRE_MESSAGE(decoded[i] == expected[i], + "Error in decoding the " + << id << "-th string: expected " + << expected[i] << "," + << " but got " << decoded[i] + << " at position " << int(i)); + } + + id_type got_id = + dict.locate({decoded.data(), decoded.data() + size}); + REQUIRE(got_id != global::invalid_term_id); + REQUIRE_MESSAGE(got_id == id, "Error in locating the " + << id + << "-th string: expected id " + << id << "," + << " but got id " << got_id); + } + + input.close(); + } + std::remove(output_filename); + } +} diff --git a/archive/test/test_inverted_index.cpp b/archive/test/test_inverted_index.cpp new file mode 100644 index 0000000..5faa823 --- /dev/null +++ b/archive/test/test_inverted_index.cpp @@ -0,0 +1,135 @@ +#include "test_common.hpp" + +using namespace autocomplete; + +typedef ef_inverted_index inverted_index_type; + +TEST_CASE("test inverted_index::iterator") { + char const* output_filename = testing::tmp_filename.c_str(); + parameters params; + params.collection_basename = testing::test_filename.c_str(); + params.load(); + + { + inverted_index_type::builder builder(params); + inverted_index_type index; + builder.build(index); + REQUIRE(index.num_docs() == params.universe); + REQUIRE(index.num_terms() == params.num_terms); + essentials::save(index, output_filename); + } + + { + inverted_index_type index; + essentials::load(index, output_filename); + REQUIRE(index.num_docs() == params.universe); + REQUIRE(index.num_terms() == params.num_terms); + + std::ifstream input((params.collection_basename + ".inverted").c_str(), + std::ios_base::in); + for (uint64_t i 
= 0; i != index.num_terms(); ++i) { + auto it = index.iterator(i); + uint32_t n = 0; + input >> n; + REQUIRE_MESSAGE(n == it.size(), "list has size " << it.size() + << " instead of " + << n); + for (uint64_t k = 0; k != n; ++k, ++it) { + id_type expected; + input >> expected; + auto got = *it; + REQUIRE_MESSAGE(got == expected, + "got " << got << " but expected " << expected); + } + } + input.close(); + + std::remove(output_filename); + } +}; + +TEST_CASE("test inverted_index::intersection_iterator") { + char const* output_filename = testing::tmp_filename.c_str(); + parameters params; + params.collection_basename = testing::test_filename.c_str(); + params.load(); + + { + inverted_index_type::builder builder(params); + inverted_index_type index; + builder.build(index); + REQUIRE(index.num_docs() == params.universe); + REQUIRE(index.num_terms() == params.num_terms); + essentials::save(index, output_filename); + } + + { + inverted_index_type index; + essentials::load(index, output_filename); + REQUIRE(index.num_docs() == params.universe); + REQUIRE(index.num_terms() == params.num_terms); + + static const uint32_t num_queries = 1000000; + static const uint32_t max_num_terms = 5; + auto queries = testing::gen_random_queries(num_queries, max_num_terms, + index.num_terms()); + + std::vector first(index.num_docs()); + std::vector second(index.num_docs()); + std::vector intersection(index.num_docs()); + + for (auto const& q : queries) { + uint32_t first_size = 0; + uint32_t second_size = 0; + assert(q.size() >= 2); + + { + auto it = index.iterator(q[0] - 1); + first_size = it.size(); + for (uint32_t i = 0; i != first_size; ++i) { + first[i] = it.access(i); + } + } + + { + auto it = index.iterator(q[1] - 1); + second_size = it.size(); + for (uint32_t i = 0; i != second_size; ++i) { + second[i] = it.access(i); + } + } + + auto end = std::set_intersection( + first.begin(), first.begin() + first_size, second.begin(), + second.begin() + second_size, intersection.begin()); + first_size = std::distance(intersection.begin(), end); + first.swap(intersection); + + for (uint32_t i = 2; i != q.size(); ++i) { + auto it = index.iterator(q[i] - 1); + second_size = it.size(); + for (uint32_t i = 0; i != second_size; ++i) { + second[i] = it.access(i); + } + end = std::set_intersection( + first.begin(), first.begin() + first_size, second.begin(), + second.begin() + second_size, intersection.begin()); + first_size = std::distance(intersection.begin(), end); + first.swap(intersection); + } + + auto it = index.intersection_iterator(q); + uint32_t n = 0; + for (; it.has_next(); ++n, ++it) { + auto doc_id = *it; + REQUIRE_MESSAGE( + doc_id == first[n], + "expected doc_id " << first[n] << " but got " << doc_id); + } + REQUIRE_MESSAGE(n == first_size, "expected " << first_size + << " results, but got " + << n); + } + std::remove(output_filename); + } +} diff --git a/archive/test/test_locate_prefix.cpp b/archive/test/test_locate_prefix.cpp new file mode 100644 index 0000000..1a81693 --- /dev/null +++ b/archive/test/test_locate_prefix.cpp @@ -0,0 +1,102 @@ +#include "test_common.hpp" + +using namespace autocomplete; + +typedef ef_completion_trie completion_trie_type; + +template +void test_locate_prefix(Dictionary const& dict, Index const& index, + std::vector const& queries, + std::vector const& strings) { + for (auto const& query : queries) { + range expected = testing::locate_prefix(strings, query); + completion_type prefix; + byte_range suffix; + parse(dict, query, prefix, suffix, true); + + range suffix_lex_range = 
dict.locate_prefix(suffix); + suffix_lex_range.begin += 1; + suffix_lex_range.end += 1; + range got = index.locate_prefix(prefix, suffix_lex_range); + + CHECK_MESSAGE((got.begin == expected.begin and got.end == expected.end), + "Error for query '" + << query << "': expected [" << expected.begin << "," + << expected.end << ") but got [" << got.begin << "," + << got.end << ")"); + } +} + +TEST_CASE("test locate_prefix()") { + parameters params; + params.collection_basename = testing::test_filename.c_str(); + params.load(); + + fc_dictionary_type dict; + { + fc_dictionary_type::builder builder(params); + builder.build(dict); + } + + std::vector strings; + + { + essentials::logger("loading all strings..."); + std::string line; + strings.reserve(params.num_completions); + std::ifstream input((params.collection_basename).c_str(), + std::ios_base::in); + for (uint32_t i = 0; i != params.num_completions; ++i) { + if (!std::getline(input, line)) break; + auto s = line.substr(line.find(' ') + 1, line.size()); + strings.push_back(s); + } + input.close(); + essentials::logger("loaded " + std::to_string(strings.size()) + + " strings"); + } + + constexpr uint32_t max_num_queries = 5000; + std::vector queries; + static std::vector percentages = {0.0, 0.25, 0.50, 0.75, 1.0}; + static std::vector query_terms = {1, 2, 3, 4, 5, 6, 7}; + + completion_trie_type ct_index; + integer_fc_dictionary_type fc_index; + + { + completion_trie_type::builder builder(params); + builder.build(ct_index); + REQUIRE(ct_index.size() == params.num_completions); + } + + { + integer_fc_dictionary_type::builder builder(params); + builder.build(fc_index); + REQUIRE(fc_index.size() == params.num_completions); + } + + for (auto perc : percentages) { + for (auto num_terms : query_terms) { + std::cout << "percentage " << perc * 100.0 << "%, num_terms " + << num_terms << std::endl; + { + queries.clear(); + std::string filename = + params.collection_basename + + ".queries/queries.length=" + std::to_string(num_terms); + std::ifstream querylog(filename.c_str()); + if (!querylog.is_open()) { + std::cerr << "cannot open file '" << filename << "'" + << std::endl; + return; + } + load_queries(queries, max_num_queries, perc, querylog); + querylog.close(); + } + + test_locate_prefix(dict, ct_index, queries, strings); + test_locate_prefix(dict, fc_index, queries, strings); + } + } +} diff --git a/archive/test/test_unsorted_list.cpp b/archive/test/test_unsorted_list.cpp new file mode 100644 index 0000000..2760532 --- /dev/null +++ b/archive/test/test_unsorted_list.cpp @@ -0,0 +1,172 @@ +#include "test_common.hpp" + +using namespace autocomplete; + +uint32_t naive_topk(std::vector const& input, range r, uint32_t k, + std::vector& topk, bool unique = false) { + uint32_t range_len = r.end - r.begin; + for (uint32_t i = 0; i != range_len; ++i) { + topk[i] = input[r.begin + i]; + } + std::sort(topk.begin(), topk.begin() + range_len); + uint32_t results = 0; + if (unique) { + auto end = std::unique(topk.begin(), topk.begin() + range_len); + results = std::min(k, std::distance(topk.begin(), end)); + } else { + results = std::min(k, range_len); + } + return results; +} + +std::vector gen_random_queries(uint32_t num_queries, + uint32_t max_range_len) { + std::vector queries; + queries.reserve(num_queries); + essentials::uniform_int_rng random(0, max_range_len); + for (uint32_t i = 0; i != num_queries; ++i) { + uint32_t x = random.gen(); + uint32_t y = random.gen(); + range r; + if (y > x) { + r = {x, y}; + } else { + r = {y, x}; + } + queries.push_back(r); 
+ } + return queries; +} + +TEST_CASE("test unsorted_list on doc_ids") { + char const* output_filename = testing::tmp_filename.c_str(); + parameters params; + params.collection_basename = testing::test_filename.c_str(); + params.load(); + + static const uint32_t k = 10; + static_assert(k <= constants::MAX_K, "k must be less than max allowed"); + static const uint32_t num_queries = 5000; + + std::vector doc_ids; + + { + doc_ids.reserve(params.num_completions); + std::ifstream input(params.collection_basename + ".mapped", + std::ios_base::in); + completion_iterator it(params, input); + while (input) { + auto const& record = *it; + doc_ids.push_back(record.doc_id); + ++it; + } + input.close(); + + // { + // // must have all ids from 0 to doc_ids.size() - 1 + // // NOTE: not true if we filter out some strings to be used as + // // queries + // std::vector tmp = doc_ids; + // std::sort(tmp.begin(), tmp.end()); + // for (id_type id = 0; id != doc_ids.size(); ++id) { + // REQUIRE_MESSAGE(tmp[id] == id, + // "Error: id " << id << " not found"); + // } + // } + + unsorted_list_type list; + list.build(doc_ids); + REQUIRE(list.size() == doc_ids.size()); + essentials::save(list, output_filename); + } + + { + unsorted_list_type list; + essentials::load(list, output_filename); + + std::vector topk(constants::MAX_K); + auto queries = gen_random_queries(num_queries, doc_ids.size()); + std::vector expected(params.num_completions); + + for (auto q : queries) { + uint32_t expected_results = naive_topk(doc_ids, q, k, expected); + uint32_t results = list.topk(q, k, topk); + REQUIRE_MESSAGE(expected_results == results, + "Error: expected " << expected_results + << " topk elements but got " + << results); + for (uint32_t i = 0; i != results; ++i) { + REQUIRE_MESSAGE(topk[i] == expected[i], + "Error: expected " << expected[i] << " but got " + << topk[i]); + } + } + + std::remove(output_filename); + } +} + +TEST_CASE("test unsorted_list on minimal doc_ids") { + char const* output_filename = testing::tmp_filename.c_str(); + parameters params; + params.collection_basename = testing::test_filename.c_str(); + params.load(); + + static const uint32_t k = 10; + static_assert(k <= constants::MAX_K, "k must be less than max allowed"); + static const uint32_t num_queries = 5000; + + std::vector doc_ids; + + { + doc_ids.reserve(params.num_terms); + std::ifstream input((params.collection_basename + ".inverted").c_str(), + std::ios_base::in); + id_type first; + for (uint64_t i = 0; i != params.num_terms; ++i) { + uint32_t n = 0; + input >> n; + input >> first; + doc_ids.push_back(first); + for (uint64_t k = 1; k != n; ++k) { + id_type x; + input >> x; + (void)x; // discard + } + } + input.close(); + REQUIRE(doc_ids.size() == params.num_terms); + + unsorted_list_type list; + list.build(doc_ids); + REQUIRE(list.size() == doc_ids.size()); + essentials::save(list, output_filename); + } + + { + unsorted_list_type list; + essentials::load(list, output_filename); + + std::vector topk(constants::MAX_K); + auto queries = gen_random_queries(num_queries, doc_ids.size()); + constexpr bool unique = true; + std::vector expected(params.num_terms); + + for (auto q : queries) { + uint32_t expected_results = + naive_topk(doc_ids, q, k, expected, unique); + uint32_t results = list.topk(q, k, topk, unique); + REQUIRE_MESSAGE(expected_results == results, + "Error: expected " << expected_results + << " topk elements but got " + << results); + for (uint32_t i = 0; i != results; ++i) { + REQUIRE_MESSAGE(topk[i] == expected[i], + "Error: expected " 
<< expected[i] << " but got " + << topk[i]); + } + } + + std::remove(output_filename); + } +} \ No newline at end of file diff --git a/test_data/build_inverted_and_forward.py b/archive/test_data/build_inverted_and_forward.py similarity index 89% rename from test_data/build_inverted_and_forward.py rename to archive/test_data/build_inverted_and_forward.py index c627699..0966d99 100644 --- a/test_data/build_inverted_and_forward.py +++ b/archive/test_data/build_inverted_and_forward.py @@ -1,5 +1,4 @@ import sys -import numpy as np input_filename = sys.argv[1] @@ -20,9 +19,11 @@ num_docs = 0 with open(input_filename + ".mapped.stats") as f: num_terms = int(f.readline()) - print num_terms + print("terms: " + str(num_terms)) + f.readline() # skip line: max num. of query terms + f.readline() # skip line: num. of completions num_docs = int(f.readline()) - print num_docs + print("universe: " + str(num_docs)) inverted_index = [[] for i in range(num_terms + 1)] # id 0 is not assigned forward_index = [[] for i in range(num_docs)] @@ -35,7 +36,7 @@ discard = False for i in range(1, len(x)): try: - term = x[i].encode('utf-8') + term = x[i] try: term_id = tokens[term] if term_id not in mapped: @@ -51,7 +52,7 @@ if not discard: # NOTE: not sorted! if doc_id >= num_docs: - print doc_id,num_docs + print(doc_id,num_docs) forward_index[doc_id] = mapped; lines += 1 diff --git a/test_data/build_stats.py b/archive/test_data/build_stats.py similarity index 76% rename from test_data/build_stats.py rename to archive/test_data/build_stats.py index f9923f0..880bcd3 100644 --- a/test_data/build_stats.py +++ b/archive/test_data/build_stats.py @@ -8,10 +8,17 @@ output_file = open(input_filename + ".stats", 'a') prev = [] +universe = 0; with open(input_filename, 'r') as f: for line in f: x = line.rstrip('\n').split() + docid = int(x[0]) + + if docid > universe: + universe = docid + q = x[1:len(x)] + level_id = 0 while level_id < len(q) and level_id < len(prev) and q[level_id] == prev[level_id]: level_id += 1 @@ -31,8 +38,12 @@ # number of completions # number of levels in the trie # number of nodes for each level +print("universe: " + str(universe + 1)) +print("completions: " + str(lines)) output_file.write(str(lines) + "\n") +output_file.write(str(universe + 1) + "\n") output_file.write(str(len(nodes_per_level)) + "\n") -for key, value in sorted(nodes_per_level.iteritems(), key = lambda kv: kv[0]): +for key, value in sorted(nodes_per_level.items(), key = lambda kv: kv[0]): output_file.write(str(value) + "\n") -output_file.close() \ No newline at end of file +output_file.close() + diff --git a/test_data/extract_dict.py b/archive/test_data/extract_dict.py similarity index 78% rename from test_data/extract_dict.py rename to archive/test_data/extract_dict.py index 875f85b..e9b48d0 100644 --- a/test_data/extract_dict.py +++ b/archive/test_data/extract_dict.py @@ -1,9 +1,8 @@ import sys -from sets import Set input_filename = sys.argv[1] -tokens = Set({}) +tokens = set() lines = 0 print("parsing input file...") @@ -14,12 +13,12 @@ tokens.add(x[i]) lines += 1 if lines % 1000000 == 0: - print "processed " + str(lines) + " lines" + print("processed " + str(lines) + " lines") print("processed " + str(lines) + " lines") print("dictionary has " + str(len(tokens)) + " keys") dict_file = open(input_filename + ".dict", 'w') for key in sorted(tokens): - dict_file.write(key.encode('utf-8') + "\n") + dict_file.write(key + "\n") dict_file.close() \ No newline at end of file diff --git a/archive/test_data/filter_and_preprocess.sh 
b/archive/test_data/filter_and_preprocess.sh new file mode 100644 index 0000000..9a5d787 --- /dev/null +++ b/archive/test_data/filter_and_preprocess.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +echo $1 # input filename + +# number of completions to exclude per completion size, +# e.g., if it is 100, then at most 7 x 100 completions are filtered out +echo $2 + +python partition_queries_by_length.py $1 $1.filtered.queries $2 +python filter_dataset.py $1 $1.filtered.queries +python extract_dict.py $1.filtered +python map_dataset.py $1.filtered +python build_stats.py $1.filtered.mapped +python build_inverted_and_forward.py $1.filtered diff --git a/archive/test_data/filter_dataset.py b/archive/test_data/filter_dataset.py new file mode 100644 index 0000000..dc68a28 --- /dev/null +++ b/archive/test_data/filter_dataset.py @@ -0,0 +1,32 @@ +import sys +from sets import Set + +input_filename = sys.argv[1] +queries_directory = sys.argv[2] + +to_filter = Set({}) +print("loading strings to filter...") +for i in range(1,7): + with open(queries_directory + "/queries.length=" + str(i)) as f: + for line in f: + s = line.rstrip('\n') + to_filter.add(s) +with open(queries_directory + "/queries.length=7+") as f: + for line in f: + s = line.rstrip('\n') + to_filter.add(s) + +lines = 0 +print("filtering dataset...") + +output_file = open(input_filename + ".filtered", 'w') +with open(input_filename, 'r') as f: + for line in f: + x = line.rstrip('\n').split() + string = ' '.join(x[1:len(x)]) + if string not in to_filter: + output_file.write(line) + lines += 1 + if lines % 1000000 == 0: + print("processed " + str(lines) + " lines") +output_file.close() \ No newline at end of file diff --git a/test_data/map_dataset.py b/archive/test_data/map_dataset.py similarity index 95% rename from test_data/map_dataset.py rename to archive/test_data/map_dataset.py index 86e6357..beb7155 100644 --- a/test_data/map_dataset.py +++ b/archive/test_data/map_dataset.py @@ -24,7 +24,7 @@ string_len = 0; mapped = [x[0]] for i in range(1, len(x)): # x[0] stores the docID - t = x[i].encode('utf-8') + t = x[i] try: id = tokens[t] mapped.append(id) @@ -48,4 +48,4 @@ stats_file.write(str(len(tokens)) + "\n") stats_file.write(str(max_string_len) + "\n") -stats_file.close() \ No newline at end of file +stats_file.close() diff --git a/archive/test_data/partition_queries_by_length.py b/archive/test_data/partition_queries_by_length.py new file mode 100644 index 0000000..3d3823b --- /dev/null +++ b/archive/test_data/partition_queries_by_length.py @@ -0,0 +1,40 @@ +import sys, os, random + +input_filename = sys.argv[1] +output_directory = sys.argv[2] +n = int(sys.argv[3]) + +if not os.path.exists(output_directory): + os.makedirs(output_directory) + +num_shards = 6 +files = [open(output_directory + "/queries.length=" + str(i), "w") for i in range(1,num_shards + 1)] +all_others = open(output_directory + "/queries.length=" + str(num_shards + 1) + "+", "w") + +strings = [[] for i in range(num_shards)] +all_others_strings = [] + +lines = 0 +with open(input_filename, 'r') as f: + for line in f: + x = line.rstrip('\n').split() + l = len(x) - 1 + string = ' '.join(x[1:l+1]) + '\n' + if l > num_shards: + all_others_strings.append(string) + else: + strings[l - 1].append(string) + lines += 1 + if lines % 1000000 == 0: + print("processed " + str(lines) + " lines") + +for i in range(num_shards): + random.shuffle(strings[i]) + for k in range(min(n, len(strings[i]))): + files[i].write(strings[i][k]) + files[i].close() + +random.shuffle(all_others_strings) +for k in 
range(min(n, len(all_others_strings))): + all_others.write(all_others_strings[k]) +all_others.close() diff --git a/archive/test_data/preprocess.sh b/archive/test_data/preprocess.sh new file mode 100755 index 0000000..b795bfe --- /dev/null +++ b/archive/test_data/preprocess.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +echo $1 # input filename +echo $2 # number of queries for each size +python3 extract_dict.py $1 +python3 map_dataset.py $1 +python3 build_stats.py $1.mapped +python3 build_inverted_and_forward.py $1 +python3 partition_queries_by_length.py $1 $1.queries $2 diff --git a/test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions b/archive/test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions similarity index 100% rename from test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions rename to archive/test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions diff --git a/web/index.html b/archive/web/index.html similarity index 100% rename from web/index.html rename to archive/web/index.html diff --git a/web/jquery-1.8.2.min.js b/archive/web/jquery-1.8.2.min.js similarity index 100% rename from web/jquery-1.8.2.min.js rename to archive/web/jquery-1.8.2.min.js diff --git a/web/jquery.autocomplete.js b/archive/web/jquery.autocomplete.js similarity index 100% rename from web/jquery.autocomplete.js rename to archive/web/jquery.autocomplete.js diff --git a/web/styles.css b/archive/web/styles.css similarity index 93% rename from web/styles.css rename to archive/web/styles.css index 5db5234..b540533 100644 --- a/web/styles.css +++ b/archive/web/styles.css @@ -9,4 +9,4 @@ .autocomplete-group { padding: 2px 5px; } .autocomplete-group strong { font-weight: bold; font-size: 16px; color: #000; display: block; border-bottom: 1px solid #000; } -input { font-size: 28px; padding: 10px; border: 1px solid #CCC; display: block; margin: 20px 0; } +input { font-size: 18px; padding: 10px; border: 1px solid #CCC; display: block; margin: 20px 0; } diff --git a/web/topkcomp.js b/archive/web/topkcomp.js similarity index 100% rename from web/topkcomp.js rename to archive/web/topkcomp.js diff --git a/autocomplete-rs/.gitignore b/autocomplete-rs/.gitignore new file mode 100644 index 0000000..da95885 --- /dev/null +++ b/autocomplete-rs/.gitignore @@ -0,0 +1,18 @@ +# Cargo +target/ + +# IDEs +.vscode/ +.idea/ + +# OS +.DS_Store + +# Rust + +# Build +build.rs + +# Cargo.lock +Cargo.lock + diff --git a/autocomplete-rs/Cargo.lock b/autocomplete-rs/Cargo.lock new file mode 100644 index 0000000..6222344 --- /dev/null +++ b/autocomplete-rs/Cargo.lock @@ -0,0 +1,857 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 4 + +[[package]] +name = "addr2line" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" + +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + +[[package]] +name = "anstream" +version = "0.6.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" + +[[package]] +name = "anstyle-parse" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" +dependencies = [ + "windows-sys 0.59.0", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6680de5231bd6ee4c6191b8a1325daa282b415391ec9d3a37bd34f2060dc73fa" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys 0.59.0", +] + +[[package]] +name = "anyhow" +version = "1.0.98" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" + +[[package]] +name = "autocfg" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" + +[[package]] +name = "autocomplete-rs" +version = "0.1.0" +dependencies = [ + "clap", + "futures", + "tempfile", + "tokio", + "tonic-build", +] + +[[package]] +name = "backtrace" +version = "0.3.75" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6806a6321ec58106fea15becdad98371e28d92ccbc7c8f1b3b6dd724fe8f1002" +dependencies = [ + "addr2line", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", + "windows-targets", +] + +[[package]] +name = "bitflags" +version = "2.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967" + +[[package]] +name = "bytes" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "clap" +version = "4.5.38" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed93b9805f8ba930df42c2590f05453d5ec36cbb85d018868a5b24d31f6ac000" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.5.38" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "379026ff283facf611b0ea629334361c4211d1b12ee01024eec1591133b04120" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.5.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09176aae279615badda0765c0c0b3f6ed53f4709118af73cf4655d85d1530cd7" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" + +[[package]] +name = "colorchoice" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "errno" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cea14ef9355e3beab063703aa9dab15afd25f0667c341310c1e5274bb1d0da18" +dependencies = [ + "libc", + "windows-sys 0.59.0", +] + +[[package]] +name = "fastrand" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" + +[[package]] +name = "fixedbitset" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" + +[[package]] +name = "futures" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" + +[[package]] +name = "futures-executor" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" + +[[package]] +name = "futures-macro" +version = "0.3.31" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "futures-sink" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" + +[[package]] +name = "futures-task" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" + +[[package]] +name = "futures-util" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "slab", +] + +[[package]] +name = "getrandom" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasi 0.14.2+wasi-0.2.4", +] + +[[package]] +name = "gimli" +version = "0.31.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" + +[[package]] +name = "hashbrown" +version = "0.15.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84b26c544d002229e640969970a2e74021aadf6e2f96372b9c58eff97de08eb3" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "indexmap" +version = "2.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e" +dependencies = [ + "equivalent", + "hashbrown", +] + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" + +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + +[[package]] +name = "libc" +version = "0.2.172" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" + +[[package]] +name = "linux-raw-sys" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12" + +[[package]] +name = "lock_api" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" +dependencies = [ + "autocfg", + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "miniz_oxide" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3be647b768db090acb35d5ec5db2b0e1f1de11133ca123b9eacf5137868f892a" +dependencies = [ + "adler2", +] + +[[package]] +name = "mio" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd" +dependencies = [ + "libc", + "wasi 0.11.0+wasi-snapshot-preview1", + "windows-sys 0.52.0", +] + +[[package]] +name = "multimap" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084" + +[[package]] +name = "object" +version = "0.36.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87" +dependencies = [ + "memchr", +] + +[[package]] +name = "once_cell" +version = "1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + +[[package]] +name = "once_cell_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" + +[[package]] +name = "parking_lot" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets", +] + +[[package]] +name = "petgraph" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" +dependencies = [ + "fixedbitset", + "indexmap", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "prettyplease" +version = "0.2.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "664ec5419c51e34154eec046ebcba56312d5a2fc3b09a06da188e1ad21afadf6" +dependencies = [ + "proc-macro2", + "syn", +] + +[[package]] +name = "proc-macro2" +version = "1.0.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "prost" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "deb1435c188b76130da55f17a466d252ff7b1418b2ad3e037d127b94e3411f29" +dependencies = [ + "bytes", + "prost-derive", +] + +[[package]] +name = "prost-build" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4" +dependencies = [ + "bytes", + "heck", + "itertools", + "log", + "multimap", + "once_cell", + "petgraph", + "prettyplease", + "prost", + "prost-types", + "regex", + "syn", + "tempfile", +] + +[[package]] +name = "prost-derive" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81bddcdb20abf9501610992b6759a4c888aef7d1a7247ef75e2404275ac24af1" +dependencies = [ + "anyhow", + "itertools", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "prost-types" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9091c90b0a32608e984ff2fa4091273cbdd755d54935c51d520887f4a1dbd5b0" +dependencies = [ + "prost", +] + +[[package]] +name = "quote" +version = "1.0.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "r-efi" +version = "5.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5" + +[[package]] +name = "redox_syscall" +version = "0.5.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "928fca9cf2aa042393a8325b9ead81d2f0df4cb12e1e24cef072922ccd99c5af" +dependencies = [ + "bitflags", +] + +[[package]] +name = "regex" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" + +[[package]] +name = "rustc-demangle" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" + +[[package]] +name = "rustix" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c71e83d6afe7ff64890ec6b71d6a69bb8a610ab78ce364b3352876bb4c801266" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.59.0", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "signal-hook-registry" +version = "1.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9203b8055f63a2a00e2f593bb0510367fe707d7ff1e5c872de2f537b339e5410" +dependencies = [ + "libc", +] + +[[package]] +name = "slab" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" +dependencies = [ + "autocfg", +] + +[[package]] +name = "smallvec" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"8917285742e9f3e1683f0a9c4e6b57960b7314d0b08d30d1ecd426713ee2eee9" + +[[package]] +name = "socket2" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f5fd57c80058a56cf5c777ab8a126398ece8e442983605d280a44ce79d0edef" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "syn" +version = "2.0.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tempfile" +version = "3.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8a64e3985349f2441a1a9ef0b853f869006c3855f2cda6862a94d26ebb9d6a1" +dependencies = [ + "fastrand", + "getrandom", + "once_cell", + "rustix", + "windows-sys 0.59.0", +] + +[[package]] +name = "tokio" +version = "1.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2513ca694ef9ede0fb23fe71a4ee4107cb102b9dc1930f6d0fd77aae068ae165" +dependencies = [ + "backtrace", + "bytes", + "libc", + "mio", + "parking_lot", + "pin-project-lite", + "signal-hook-registry", + "socket2", + "tokio-macros", + "windows-sys 0.52.0", +] + +[[package]] +name = "tokio-macros" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tonic-build" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d021fc044c18582b9a2408cd0dd05b1596e3ecdb5c4df822bb0183545683889" +dependencies = [ + "prettyplease", + "proc-macro2", + "prost-build", + "quote", + "syn", +] + +[[package]] +name = "unicode-ident" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "wasi" +version = "0.14.2+wasi-0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3" +dependencies = [ + "wit-bindgen-rt", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "wit-bindgen-rt" +version = "0.39.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1" +dependencies = [ + "bitflags", +] diff --git a/autocomplete-rs/Cargo.toml b/autocomplete-rs/Cargo.toml new file mode 100644 index 0000000..68ed87f --- /dev/null +++ b/autocomplete-rs/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "autocomplete-rs" +version = "0.1.0" +edition = "2021" + +[dependencies] +tokio = { version = "1.0", features = ["full"] } +futures = "0.3" +clap = { version = "4.4", features = ["derive"] } + +[dev-dependencies] +tempfile = "3.8" + +[build-dependencies] +tonic-build = "0.10" diff --git a/autocomplete-rs/LICENSE b/autocomplete-rs/LICENSE new file mode 100644 index 0000000..d874d0b --- /dev/null +++ b/autocomplete-rs/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 Autocomplete Service Contributors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/autocomplete-rs/README.md b/autocomplete-rs/README.md new file mode 100644 index 0000000..4c11811 --- /dev/null +++ b/autocomplete-rs/README.md @@ -0,0 +1,118 @@ +# Autocomplete Service + +A high-performance autocomplete service written in Rust, supporting both gRPC and GraphQL interfaces. + +## Features + +- **Dual API Support** + - gRPC interface for high-performance RPC calls + - GraphQL interface for flexible querying + - Shared backend implementation for both APIs + +- **Core Features** + - Fast prefix-based autocomplete + - Score-based ranking of suggestions + - Memory-efficient string storage + - Concurrent request handling + +- **API Endpoints** + - gRPC: `[::1]:50051` (configurable) + - GraphQL: `[::1]:8000/graphql` (configurable) + - GraphQL Playground: `[::1]:8000/playground` + +## Project Status + +### Completed +- ✅ Basic autocomplete implementation +- ✅ gRPC server implementation +- ✅ GraphQL server implementation +- ✅ Command-line configuration +- ✅ Shared backend between APIs + +### In Progress +- 🔄 Documentation +- 🔄 Testing suite +- 🔄 Performance benchmarks + +### Planned +- ⏳ Authentication +- ⏳ Rate limiting +- ⏳ Metrics and monitoring +- ⏳ Docker support +- ⏳ Client examples in multiple languages + +## Getting Started + +### Prerequisites +- Rust 1.70 or later +- Cargo + +### Building +```bash +cargo build --release +``` + +### Running +```bash +# Default configuration +cargo run + +# Custom addresses +cargo run -- --grpc-addr 127.0.0.1:50051 --graphql-addr 127.0.0.1:8000 + +# Show help +cargo run -- --help +``` + +## API Usage + +### gRPC +```protobuf +service AutocompleteService { + rpc Complete(CompleteRequest) returns (CompleteResponse); + rpc Init(InitRequest) returns (InitResponse); + rpc GetStats(StatsRequest) returns (StatsResponse); +} +``` + +### GraphQL +```graphql +type Query { + complete(prefix: String!, maxResults: Int): CompleteResponse! + stats: StatsResponse! +} + +type Mutation { + init(strings: [StringInput!]!): InitResponse! +} +``` + +## Project Structure + +``` +autocomplete-rs/ +├── src/ +│ ├── main.rs # Entry point and CLI +│ ├── autocomplete.rs # Core autocomplete logic +│ ├── graphql.rs # GraphQL schema and resolvers +│ ├── server.rs # Server implementations +│ ├── string_pool.rs # String interning +│ ├── trie.rs # Trie data structure +│ └── types.rs # Common types +├── proto/ +│ └── autocomplete.proto # gRPC service definition +└── schema/ + └── schema.graphql # GraphQL schema +``` + +## Contributing + +1. Fork the repository +2. Create your feature branch (`git checkout -b feature/amazing-feature`) +3. Commit your changes (`git commit -m 'Add amazing feature'`) +4. Push to the branch (`git push origin feature/amazing-feature`) +5. Open a Pull Request + +## License + +This project is licensed under the MIT License - see the LICENSE file for details. 
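+
+## Example client
+
+A quick way to exercise the gRPC API is the bundled example client in `examples/client.rs`, assuming a server is listening on the default `[::1]:50051` address:
+
+```bash
+# terminal 1: start the server
+cargo run
+
+# terminal 2: run the example client against it
+cargo run --example client
+```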
\ No newline at end of file
diff --git a/autocomplete-rs/build.rs b/autocomplete-rs/build.rs
new file mode 100644
index 0000000..ed0ba48
--- /dev/null
+++ b/autocomplete-rs/build.rs
@@ -0,0 +1,3 @@
+fn main() {
+    // No build-time code generation needed
+}
\ No newline at end of file
diff --git a/autocomplete-rs/examples/client.rs b/autocomplete-rs/examples/client.rs
new file mode 100644
index 0000000..cbdb2c9
--- /dev/null
+++ b/autocomplete-rs/examples/client.rs
@@ -0,0 +1,36 @@
+use autocomplete_proto::{
+    autocomplete_service_client::AutocompleteServiceClient,
+    CompleteRequest, InitRequest, StringScore,
+};
+
+pub mod autocomplete_proto {
+    tonic::include_proto!("autocomplete");
+}
+
+#[tokio::main]
+async fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let mut client = AutocompleteServiceClient::connect("http://[::1]:50051").await?;
+
+    // Initialize with some test data
+    let init_request = InitRequest {
+        strings: vec![
+            StringScore { text: "hello".to_string(), score: 1.0 },
+            StringScore { text: "help".to_string(), score: 0.8 },
+            StringScore { text: "hell".to_string(), score: 0.6 },
+        ],
+    };
+
+    let response = client.init(init_request).await?;
+    println!("INIT RESPONSE: {:?}", response);
+
+    // Get completions
+    let request = CompleteRequest {
+        prefix: "hel".to_string(),
+        max_results: 10,
+    };
+
+    let response = client.complete(request).await?;
+    println!("COMPLETE RESPONSE: {:?}", response);
+
+    Ok(())
+}
\ No newline at end of file
diff --git a/autocomplete-rs/schema/schema.graphql b/autocomplete-rs/schema/schema.graphql
new file mode 100644
index 0000000..70da230
--- /dev/null
+++ b/autocomplete-rs/schema/schema.graphql
@@ -0,0 +1,41 @@
+type Query {
+  # Get completions for a prefix
+  complete(prefix: String!, maxResults: Int): CompleteResponse!
+
+  # Get system statistics
+  stats: Stats!
+}
+
+type Mutation {
+  # Initialize the autocomplete system with strings and scores
+  init(strings: [StringScoreInput!]!): InitResponse!
+}
+
+# Input type for string with score
+input StringScoreInput {
+  text: String!
+  score: Float!
+}
+
+# Response type for completions
+type CompleteResponse {
+  completions: [Completion!]!
+}
+
+# A single completion result
+type Completion {
+  text: String!
+  score: Float!
+}
+
+# Response type for initialization
+type InitResponse {
+  success: Boolean!
+  error: String
+}
+
+# System statistics
+type Stats {
+  numTerms: Int!
+  memoryBytes: Int!
+}
\ No newline at end of file
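For reference, a query shaped by this schema might look as follows (illustrative values only; per the README, the GraphQL endpoint defaults to `[::1]:8000/graphql`):

```graphql
query {
  complete(prefix: "hel", maxResults: 5) {
    completions {
      text
      score
    }
  }
}
```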
diff --git a/autocomplete-rs/src/autocomplete.rs b/autocomplete-rs/src/autocomplete.rs
new file mode 100644
index 0000000..bb9ffa6
--- /dev/null
+++ b/autocomplete-rs/src/autocomplete.rs
@@ -0,0 +1,45 @@
+use crate::types::ScoreType;
+use crate::trie::Trie;
+use super::dictionary::Dictionary;
+
+#[derive(Clone)]
+pub struct Autocomplete {
+    trie: Trie,
+    dictionary: Dictionary,
+}
+
+impl Autocomplete {
+    pub fn new() -> Self {
+        Self {
+            trie: Trie::new(),
+            dictionary: Dictionary::new(),
+        }
+    }
+
+    pub fn init(&mut self, strings: &[(String, ScoreType)]) -> Result<(), String> {
+        for (string, score) in strings {
+            let id = self.dictionary.insert(string.clone());
+            self.trie.insert(string, id, *score);
+        }
+        Ok(())
+    }
+
+    pub fn complete(&self, prefix: &str) -> Vec<(String, ScoreType)> {
+        let completions = self.trie.complete(prefix);
+        completions
+            .into_iter()
+            .filter_map(|(id, score)| {
+                self.dictionary.get(id).map(|text| (text.to_string(), score))
+            })
+            .collect()
+    }
+
+    pub fn num_terms(&self) -> usize {
+        self.dictionary.len()
+    }
+
+    pub fn bytes(&self) -> usize {
+        // TODO: Implement actual memory usage calculation
+        0
+    }
+}
\ No newline at end of file
diff --git a/autocomplete-rs/src/constants.rs b/autocomplete-rs/src/constants.rs
new file mode 100644
index 0000000..b949eb7
--- /dev/null
+++ b/autocomplete-rs/src/constants.rs
@@ -0,0 +1,8 @@
+// Constants for the autocomplete system
+pub const MAX_K: u32 = 15;
+pub const MAX_NUM_TERMS_PER_QUERY: u32 = 64;
+pub const MAX_NUM_CHARS_PER_QUERY: u32 = 128;
+pub const POOL_SIZE: usize = (MAX_K as usize) * (MAX_NUM_CHARS_PER_QUERY as usize);
+
+// Compile-time assertion
+const _: () = assert!(MAX_NUM_TERMS_PER_QUERY < 256, "MAX_NUM_TERMS_PER_QUERY must be < 256");
\ No newline at end of file
diff --git a/autocomplete-rs/src/dictionary.rs b/autocomplete-rs/src/dictionary.rs
new file mode 100644
index 0000000..a09adda
--- /dev/null
+++ b/autocomplete-rs/src/dictionary.rs
@@ -0,0 +1,46 @@
+use crate::types::IdType;
+
+#[derive(Clone)]
+pub struct Dictionary {
+    strings: Vec<String>,
+    id_map: std::collections::HashMap<String, IdType>,
+    next_id: IdType,
+}
+
+impl Dictionary {
+    pub fn new() -> Self {
+        Self {
+            strings: Vec::new(),
+            id_map: std::collections::HashMap::new(),
+            next_id: 0,
+        }
+    }
+
+    pub fn insert(&mut self, string: String) -> IdType {
+        if let Some(&id) = self.id_map.get(&string) {
+            return id;
+        }
+
+        let id = self.next_id;
+        self.next_id += 1;
+        self.strings.push(string.clone());
+        self.id_map.insert(string, id);
+        id
+    }
+
+    pub fn get(&self, id: IdType) -> Option<&str> {
+        self.strings.get(id as usize).map(|s| s.as_str())
+    }
+
+    pub fn get_id(&self, string: &str) -> Option<IdType> {
+        self.id_map.get(string).copied()
+    }
+
+    pub fn len(&self) -> usize {
+        self.strings.len()
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.strings.is_empty()
+    }
+}
\ No newline at end of file
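A quick usage sketch (illustrative snippet, not part of the diff): the dictionary interns strings, handing out sequential IDs and returning the existing ID on duplicate inserts.

```rust
use autocomplete_rs::dictionary::Dictionary;

fn main() {
    let mut dict = Dictionary::new();
    let a = dict.insert("hello".to_string());
    let b = dict.insert("hello".to_string()); // duplicate: same id handed back
    assert_eq!(a, b);
    assert_eq!(dict.get(a), Some("hello"));
    assert_eq!(dict.get_id("world"), None);
    assert_eq!(dict.len(), 1);
}
```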
diff --git a/autocomplete-rs/src/index.rs b/autocomplete-rs/src/index.rs
new file mode 100644
index 0000000..29da316
--- /dev/null
+++ b/autocomplete-rs/src/index.rs
@@ -0,0 +1,182 @@
+use crate::types::{IdType, ScoreType};
+use crate::trie::Trie;
+use crate::dictionary::Dictionary;
+
+#[derive(Clone)]
+pub struct Index {
+    trie: Trie,
+    dictionary: Dictionary,
+}
+
+impl Index {
+    pub fn new() -> Self {
+        Self {
+            trie: Trie::new(),
+            dictionary: Dictionary::new(),
+        }
+    }
+
+    pub fn add_doc(&mut self, _doc_id: IdType, text: &str, score: ScoreType) {
+        let id = self.dictionary.insert(text.to_string());
+        self.trie.insert(text, id, score);
+    }
+
+    pub fn search(&self, prefix: &str) -> Vec<(IdType, ScoreType)> {
+        let completions = self.trie.complete(prefix);
+        completions
+            .into_iter()
+            .filter_map(|(id, score)| {
+                self.dictionary.get(id).map(|_| (id, score))
+            })
+            .collect()
+    }
+
+    pub fn num_terms(&self) -> usize {
+        self.dictionary.len()
+    }
+
+    pub fn bytes(&self) -> usize {
+        // TODO: Implement actual memory usage calculation
+        0
+    }
+}
+
+/// Blocked inverted index for efficient document retrieval
+pub struct BlockedInvertedIndex {
+    blocks: Vec<Vec<IdType>>,
+    block_size: usize,
+}
+
+impl BlockedInvertedIndex {
+    /// Create a new blocked inverted index
+    pub fn new(block_size: usize) -> Self {
+        Self {
+            blocks: Vec::new(),
+            block_size,
+        }
+    }
+
+    /// Add a document to the index
+    pub fn insert(&mut self, id: IdType) {
+        if self.blocks.is_empty() || self.blocks.last().unwrap().len() >= self.block_size {
+            self.blocks.push(Vec::with_capacity(self.block_size));
+        }
+        self.blocks.last_mut().unwrap().push(id);
+    }
+
+    /// Get documents for a term
+    pub fn get(&self, block_id: usize) -> Option<&[IdType]> {
+        self.blocks.get(block_id).map(|v| v.as_slice())
+    }
+
+    /// Get the number of blocks
+    pub fn num_blocks(&self) -> usize {
+        self.blocks.len()
+    }
+
+    /// Get the block size
+    pub fn block_size(&self) -> usize {
+        self.block_size
+    }
+}
+
+/// Compact vector for efficient storage
+pub struct CompactVector {
+    data: Vec<u8>,
+    element_size: usize,
+    num_elements: usize,
+}
+
+impl CompactVector {
+    /// Create a new compact vector
+    pub fn new(element_size: usize) -> Self {
+        Self {
+            data: Vec::new(),
+            element_size,
+            num_elements: 0,
+        }
+    }
+
+    /// Add an element to the vector
+    pub fn push(&mut self, element: &[u8]) {
+        assert_eq!(element.len(), self.element_size);
+        self.data.extend_from_slice(element);
+        self.num_elements += 1;
+    }
+
+    /// Get an element from the vector
+    pub fn get(&self, index: usize) -> Option<&[u8]> {
+        if index >= self.num_elements {
+            return None;
+        }
+        let start = index * self.element_size;
+        let end = start + self.element_size;
+        Some(&self.data[start..end])
+    }
+
+    /// Get the number of elements
+    pub fn size(&self) -> usize {
+        self.num_elements
+    }
+
+    /// Get the size in bytes
+    pub fn bytes(&self) -> usize {
+        self.data.len()
+    }
+}
+
+/// Bit vector for efficient bit-level operations
+pub struct BitVector {
+    data: Vec<u8>,
+    num_bits: usize,
+}
+
+impl BitVector {
+    /// Create a new bit vector
+    pub fn new(num_bits: usize) -> Self {
+        let num_bytes = (num_bits + 7) / 8;
+        Self {
+            data: vec![0; num_bytes],
+            num_bits,
+        }
+    }
+
+    /// Set a bit
+    pub fn set(&mut self, index: usize) {
+        if index < self.num_bits {
+            let byte_idx = index / 8;
+            let bit_idx = index % 8;
+            self.data[byte_idx] |= 1 << bit_idx;
+        }
+    }
+
+    /// Clear a bit
+    pub fn clear(&mut self, index: usize) {
+        if index < self.num_bits {
+            let byte_idx = index / 8;
+            let bit_idx = index % 8;
+            self.data[byte_idx] &= !(1 << bit_idx);
+        }
+    }
+
+    /// Test a bit
+    pub fn test(&self, index: usize) -> bool {
+        if index < self.num_bits {
+            let byte_idx = index / 8;
+            let bit_idx = index % 8;
+            (self.data[byte_idx] & (1 << bit_idx)) != 0
+        } else {
+            false
+        }
+    }
+
+    /// Get the number of bits
+    pub fn size(&self) -> usize {
+        self.num_bits
+    }
+
+    /// Get the size in bytes
+    pub fn bytes(&self) -> usize {
+        self.data.len()
+    }
+}
\ No newline at end of file
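A small illustrative check of the `BitVector` semantics above (hypothetical usage, in the spirit of the tests later in this diff):

```rust
use autocomplete_rs::index::BitVector;

fn main() {
    let mut bits = BitVector::new(10); // 10 bits fit in (10 + 7) / 8 = 2 bytes
    bits.set(3);
    assert!(bits.test(3));
    bits.clear(3);
    assert!(!bits.test(3));
    assert!(!bits.test(99)); // out-of-range reads answer false instead of panicking
    assert_eq!(bits.bytes(), 2);
}
```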
diff --git a/autocomplete-rs/src/lib.rs b/autocomplete-rs/src/lib.rs
new file mode 100644
index 0000000..70048c6
--- /dev/null
+++ b/autocomplete-rs/src/lib.rs
@@ -0,0 +1,19 @@
+pub mod dictionary;
+pub mod types;
+pub mod trie;
+pub mod constants;
+pub mod parameters;
+pub mod probe;
+pub mod string_pool;
+pub mod index;
+pub mod autocomplete;
+
+pub use dictionary::Dictionary;
+pub use types::*;
+pub use trie::*;
+pub use constants::*;
+pub use parameters::*;
+pub use probe::*;
+pub use string_pool::*;
+pub use index::*;
+pub use autocomplete::*;
\ No newline at end of file
diff --git a/autocomplete-rs/src/main.rs b/autocomplete-rs/src/main.rs
new file mode 100644
index 0000000..751189f
--- /dev/null
+++ b/autocomplete-rs/src/main.rs
@@ -0,0 +1,25 @@
+use std::error::Error;
+use clap::Parser;
+
+/// Autocomplete service
+#[derive(Parser, Debug)]
+#[command(author, version, about, long_about = None)]
+struct Args {
+    /// Input file path
+    #[arg(short, long)]
+    input: Option<String>,
+}
+
+#[tokio::main]
+async fn main() -> Result<(), Box<dyn Error>> {
+    let args = Args::parse();
+
+    println!("Autocomplete Service");
+
+    if let Some(input) = args.input {
+        println!("Processing input file: {}", input);
+        // TODO: Implement file processing logic
+    }
+
+    Ok(())
+}
diff --git a/autocomplete-rs/src/parameters.rs b/autocomplete-rs/src/parameters.rs
new file mode 100644
index 0000000..38d5fec
--- /dev/null
+++ b/autocomplete-rs/src/parameters.rs
@@ -0,0 +1,115 @@
+use std::fs::File;
+use std::io::{self, BufRead, BufReader};
+use std::path::Path;
+
+use crate::constants::{MAX_NUM_CHARS_PER_QUERY, MAX_NUM_TERMS_PER_QUERY};
+
+/// Parameters for the autocomplete system
+#[derive(Debug, Default)]
+pub struct Parameters {
+    pub num_terms: u32,
+    pub max_string_length: u32,
+    pub num_completions: u32,
+    pub universe: u32,
+    pub num_levels: u32,
+    pub nodes_per_level: Vec<u32>,
+    pub collection_basename: String,
+}
+
+impl Parameters {
+    /// Creates a new empty Parameters instance
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Loads parameters from a statistics file
+    pub fn load(&mut self) -> io::Result<()> {
+        let stats_path = if self.collection_basename.ends_with(".mapped.stats") {
+            Path::new(&self.collection_basename).to_path_buf()
+        } else {
+            Path::new(&self.collection_basename).with_extension("mapped.stats")
+        };
+
+        let file = File::open(stats_path)?;
+        let reader = BufReader::new(file);
+        let mut lines = reader.lines();
+
+        // Read basic statistics
+        self.num_terms = lines.next()
+            .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Missing num_terms"))??
+            .parse()
+            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
+
+        self.max_string_length = lines.next()
+            .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Missing max_string_length"))??
+            .parse()
+            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
+
+        self.num_completions = lines.next()
+            .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Missing num_completions"))??
+            .parse()
+            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
+
+        self.universe = lines.next()
+            .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Missing universe"))??
+            .parse()
+            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
+
+        self.num_levels = lines.next()
+            .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Missing num_levels"))??
+            .parse()
+            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
+
+        // Validate basic statistics
+        if self.num_terms == 0 {
+            return Err(io::Error::new(io::ErrorKind::InvalidData, "num_terms must be > 0"));
+        }
+        if self.max_string_length == 0 {
+            return Err(io::Error::new(io::ErrorKind::InvalidData, "max_string_length must be > 0"));
+        }
+        if self.num_completions == 0 {
+            return Err(io::Error::new(io::ErrorKind::InvalidData, "num_completions must be > 0"));
+        }
+        if self.universe < self.num_completions {
+            return Err(io::Error::new(io::ErrorKind::InvalidData, "universe must be >= num_completions"));
+        }
+        if self.num_levels == 0 {
+            return Err(io::Error::new(io::ErrorKind::InvalidData, "num_levels must be > 0"));
+        }
+
+        // Validate against constants
+        if self.max_string_length > MAX_NUM_CHARS_PER_QUERY {
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidData,
+                format!("max_string_length ({}) exceeds MAX_NUM_CHARS_PER_QUERY ({})",
+                        self.max_string_length, MAX_NUM_CHARS_PER_QUERY)
+            ));
+        }
+        if self.num_levels > MAX_NUM_TERMS_PER_QUERY {
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidData,
+                format!("num_levels ({}) exceeds MAX_NUM_TERMS_PER_QUERY ({})",
+                        self.num_levels, MAX_NUM_TERMS_PER_QUERY)
+            ));
+        }
+
+        // Read nodes per level
+        self.nodes_per_level = Vec::with_capacity(self.num_levels as usize);
+        for _ in 0..self.num_levels {
+            let count = lines.next()
+                .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Missing nodes_per_level data"))??
+                .parse()
+                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
+            self.nodes_per_level.push(count);
+        }
+
+        if self.nodes_per_level.len() != self.num_levels as usize {
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidData,
+                "File with statistics may be truncated or malformed"
+            ));
+        }
+
+        Ok(())
+    }
+}
\ No newline at end of file
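A minimal sketch of driving the loader (the basename below is hypothetical; field names come from the struct above). `load()` resolves the path to `<basename>.mapped.stats` unless the name already carries that suffix:

```rust
use autocomplete_rs::parameters::Parameters;

fn main() -> std::io::Result<()> {
    let mut params = Parameters::new();
    // "data/wiki" is a placeholder; load() opens "data/wiki.mapped.stats".
    params.collection_basename = "data/wiki".to_string();
    params.load()?;
    println!("{} terms, {} levels", params.num_terms, params.num_levels);
    Ok(())
}
```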
diff --git a/autocomplete-rs/src/probe.rs b/autocomplete-rs/src/probe.rs
new file mode 100644
index 0000000..c82f825
--- /dev/null
+++ b/autocomplete-rs/src/probe.rs
@@ -0,0 +1,81 @@
+use std::time::{Duration, Instant};
+
+/// A trait for performance measurement probes
+pub trait Probe {
+    /// Start timing an operation
+    fn start(&mut self, id: u64);
+    /// Stop timing an operation
+    fn stop(&mut self, id: u64);
+}
+
+/// A no-operation probe that does nothing
+#[derive(Debug, Default)]
+pub struct NopProbe;
+
+impl Probe for NopProbe {
+    fn start(&mut self, _id: u64) {}
+    fn stop(&mut self, _id: u64) {}
+}
+
+/// A timer probe that measures operation durations
+#[derive(Debug)]
+pub struct TimerProbe {
+    timers: Vec<Timer>,
+}
+
+#[derive(Debug, Default, Clone)]
+struct Timer {
+    start_time: Option<Instant>,
+    total_duration: Duration,
+}
+
+impl Timer {
+    fn new() -> Self {
+        Self {
+            start_time: None,
+            total_duration: Duration::default(),
+        }
+    }
+
+    fn start(&mut self) {
+        self.start_time = Some(Instant::now());
+    }
+
+    fn stop(&mut self) {
+        if let Some(start) = self.start_time {
+            self.total_duration += start.elapsed();
+            self.start_time = None;
+        }
+    }
+
+    fn get_duration(&self) -> Duration {
+        self.total_duration
+    }
+}
+
+impl TimerProbe {
+    /// Creates a new TimerProbe with the specified number of timers
+    pub fn new(num_timers: u64) -> Self {
+        Self {
+            timers: vec![Timer::new(); num_timers as usize],
+        }
+    }
+
+    /// Gets the total duration for a specific timer
+    pub fn get_duration(&self, id: u64) -> Duration {
+        assert!(id < self.timers.len() as u64, "Timer ID out of bounds");
+        self.timers[id as usize].get_duration()
+    }
+}
+
+impl Probe for TimerProbe {
+    fn start(&mut self, id: u64) {
+        assert!(id < self.timers.len() as u64, "Timer ID out of bounds");
+        self.timers[id as usize].start();
+    }
+
+    fn stop(&mut self, id: u64) {
+        assert!(id < self.timers.len() as u64, "Timer ID out of bounds");
+        self.timers[id as usize].stop();
+    }
+}
\ No newline at end of file
diff --git a/autocomplete-rs/src/string_pool.rs b/autocomplete-rs/src/string_pool.rs
new file mode 100644
index 0000000..332645a
--- /dev/null
+++ b/autocomplete-rs/src/string_pool.rs
@@ -0,0 +1,157 @@
+use crate::types::{ByteRange, IdType};
+
+/// Represents a scored byte range
+#[derive(Debug, Clone)]
+pub struct ScoredByteRange {
+    pub string: ByteRange,
+    pub score: IdType,
+}
+
+/// Manages a pool of scored strings
+pub struct ScoredStringPool {
+    data: Vec<u8>,
+    offsets: Vec<usize>,
+    scores: Vec<f32>,
+}
+
+impl ScoredStringPool {
+    /// Create a new empty string pool
+    pub fn new() -> Self {
+        Self {
+            data: Vec::new(),
+            offsets: vec![0],
+            scores: Vec::new(),
+        }
+    }
+
+    /// Initialize the pool by resetting it to the single sentinel offset
+    pub fn init(&mut self) {
+        self.offsets.clear();
+        self.push_back_offset(0);
+    }
+
+    /// Resize the pool
+    pub fn resize(&mut self, num_bytes: usize, k: u32) {
+        self.scores.resize(k as usize, 0.0);
+        self.data.resize(num_bytes, 0);
+    }
+
+    /// Clear the pool, keeping the leading sentinel offset so size() stays valid
+    pub fn clear(&mut self) {
+        self.offsets.truncate(1);
+    }
+
+    /// Get the number of strings in the pool
+    pub fn size(&self) -> usize {
+        assert!(!self.offsets.is_empty());
+        self.offsets.len() - 1
+    }
+
+    /// Get the total number of bytes used by the pool's buffers
+    pub fn bytes(&self) -> usize {
+        // Count buffer contents, not just the Vec handles
+        self.data.len()
+            + self.offsets.len() * std::mem::size_of::<usize>()
+            + self.scores.len() * std::mem::size_of::<f32>()
+    }
+
+    /// Get a mutable reference to the data
+    pub fn data_mut(&mut self) -> &mut [u8] {
+        &mut self.data
+    }
+
+    /// Add a new offset
+    pub fn push_back_offset(&mut self, offset: usize) {
+        self.offsets.push(offset);
+    }
+
+    /// Get a mutable reference to the scores
+    pub fn scores_mut(&mut self) -> &mut [f32] {
+        &mut self.scores
+    }
+
+    /// Get a reference to the scores
+    pub fn scores(&self) -> &[f32] {
+        &self.scores
+    }
+
+    /// Get a scored byte range at the given index
+    pub fn get(&self, index: usize) -> ByteRange {
+        if index >= self.offsets.len() - 1 {
+            return ByteRange::new(0, 0);
+        }
+        ByteRange::new(
+            self.offsets[index],
+            self.offsets[index + 1]
+        )
+    }
+
+    /// Set the offsets vector
+    pub fn set_offsets(&mut self, offsets: Vec<usize>) {
+        self.offsets = offsets;
+    }
+
+    /// Set the scores vector
+    pub fn set_scores(&mut self, scores: Vec<f32>) {
+        self.scores = scores;
+    }
+
+    /// Set the data vector
+    pub fn set_data(&mut self, data: Vec<u8>) {
+        self.data = data;
+    }
+
+    pub fn get_score(&self, index: usize) -> f32 {
+        self.scores.get(index).copied().unwrap_or(0.0)
+    }
+}
+
+/// Iterator over scored strings in the pool
+pub struct ScoredStringPoolIterator<'a> {
+    pool: &'a ScoredStringPool,
+    pos: usize,
+}
+
+impl<'a> ScoredStringPoolIterator<'a> {
+    /// Create a new iterator
+    pub fn new(pool: &'a ScoredStringPool, pos: usize) -> Self {
+        Self { pool, pos }
+    }
+
+    /// Check if the iterator is empty
+    pub fn empty(&self) -> bool {
+        self.size() == 0
+    }
+
+    /// Get the number of strings
+    pub fn size(&self) -> usize {
+        self.pool.size()
+    }
+
+    /// Get the pool
+    pub fn pool(&self) -> &ScoredStringPool {
+        self.pool
+    }
+}
+
+impl<'a> Iterator for ScoredStringPoolIterator<'a> {
+    type Item = ScoredByteRange;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.pos < self.pool.size() {
+            let item = ScoredByteRange {
+                string: self.pool.get(self.pos),
+                score: self.pool.get_score(self.pos) as IdType,
+            };
+            self.pos += 1;
+            Some(item)
+        } else {
+            None
+        }
+    }
+}
+
+impl ScoredStringPool {
+    /// Get an iterator over the scored strings
+    pub fn iter(&self) -> ScoredStringPoolIterator {
+        ScoredStringPoolIterator::new(self, 0)
+    }
+}
\ No newline at end of file
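A sketch of how the pool's parallel arrays line up (hypothetical data; offsets delimit each string's byte range, and the iterator truncates the `f32` scores to `IdType`):

```rust
use autocomplete_rs::string_pool::ScoredStringPool;

fn main() {
    let mut pool = ScoredStringPool::new();
    // Two strings packed back to back: "hello" at bytes 0..5, "help" at bytes 5..9.
    pool.set_data(b"hellohelp".to_vec());
    pool.set_offsets(vec![0, 5, 9]);
    pool.set_scores(vec![10.0, 8.0]);

    for scored in pool.iter() {
        println!("bytes {}..{} -> score {}",
                 scored.string.start, scored.string.end, scored.score);
    }
}
```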
diff --git a/autocomplete-rs/src/trie.rs b/autocomplete-rs/src/trie.rs
new file mode 100644
index 0000000..05f80e5
--- /dev/null
+++ b/autocomplete-rs/src/trie.rs
@@ -0,0 +1,147 @@
+use std::collections::HashMap;
+use crate::types::IdType;
+
+#[derive(Default, Clone)]
+struct TrieNode {
+    children: HashMap<char, Box<TrieNode>>,
+    id: Option<IdType>,
+    score: f32,
+}
+
+impl TrieNode {
+    fn new() -> Self {
+        Self {
+            children: HashMap::new(),
+            id: None,
+            score: 0.0,
+        }
+    }
+
+    fn is_terminal(&self) -> bool {
+        self.id.is_some()
+    }
+}
+
+#[derive(Clone)]
+pub struct Trie {
+    root: TrieNode,
+}
+
+impl Trie {
+    pub fn new() -> Self {
+        Self {
+            root: TrieNode::new(),
+        }
+    }
+
+    pub fn insert(&mut self, completion: &str, id: IdType, score: f32) {
+        let mut current = &mut self.root;
+        let chars: Vec<char> = completion.chars().collect();
+
+        for &c in &chars {
+            current = current.children
+                .entry(c)
+                .or_insert_with(|| Box::new(TrieNode::new()));
+        }
+
+        current.id = Some(id);
+        current.score = score;
+    }
+
+    pub fn remove(&mut self, completion: &str) -> bool {
+        let mut path = Vec::new();
+        let mut current = &mut self.root;
+
+        // First pass: find the path to the node
+        for c in completion.chars() {
+            if let Some(next) = current.children.get_mut(&c) {
+                path.push(c);
+                current = next;
+            } else {
+                return false; // String not found
+            }
+        }
+
+        // If the node is not a terminal, the string wasn't in the trie
+        if !current.is_terminal() {
+            return false;
+        }
+
+        // Remove the terminal marker
+        current.id = None;
+        current.score = 0.0;
+
+        // Second pass: prune the last node if it is now an empty non-terminal
+        if let Some((&last, parent_path)) = path.split_last() {
+            let mut parent = &mut self.root;
+            for &c in parent_path {
+                parent = parent.children.get_mut(&c).unwrap();
+            }
+            let is_prunable = parent
+                .children
+                .get(&last)
+                .map_or(false, |node| node.children.is_empty() && !node.is_terminal());
+            if is_prunable {
+                parent.children.remove(&last);
+            }
+        }
+
+        true
+    }
+
+    pub fn complete(&self, prefix: &str) -> Vec<(IdType, f32)> {
+        let mut current = &self.root;
+
+        // Navigate to the prefix node
+        for c in prefix.chars() {
+            if let Some(next) = current.children.get(&c) {
+                current = next;
+            } else {
+                return Vec::new(); // Prefix not found
+            }
+        }
+
+        // Collect all completions from this node
+        let mut results = Vec::new();
+        self.collect_completions(current, &mut results);
+        results
+    }
+
+    fn collect_completions(&self, node: &TrieNode, results: &mut Vec<(IdType, f32)>) {
+        if let Some(id) = node.id {
+            results.push((id, node.score));
+        }
+
+        for child in node.children.values() {
+            self.collect_completions(child, results);
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_trie_insert_and_complete() {
+        let mut trie = Trie::new();
+        trie.insert("hello", 1, 1.0);
+        trie.insert("help", 2, 0.8);
+        trie.insert("world", 3, 0.5);
+
+        let completions = trie.complete("hel");
+        assert_eq!(completions.len(), 2);
+        assert!(completions.contains(&(1, 1.0)));
+        assert!(completions.contains(&(2, 0.8)));
+    }
+
+    #[test]
+    fn test_trie_remove() {
+        let mut trie = Trie::new();
+        trie.insert("hello", 1, 1.0);
+        trie.insert("help", 2, 0.8);
+
+        assert!(trie.remove("hello"));
+        assert!(!trie.remove("hello")); // Already removed
+        assert!(trie.remove("help"));
+
+        let completions = trie.complete("hel");
+        assert_eq!(completions.len(), 0);
+    }
+}
\ No newline at end of file
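`complete` gathers matches in traversal order rather than by rank, so callers sort by score themselves; a minimal sketch:

```rust
use autocomplete_rs::trie::Trie;

fn main() {
    let mut trie = Trie::new();
    trie.insert("hello", 1, 1.0);
    trie.insert("help", 2, 0.8);
    trie.insert("hell", 3, 0.6);

    // Rank hits by score, highest first.
    let mut hits = trie.complete("hel");
    hits.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
    assert_eq!(hits[0], (1, 1.0));
}
```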
diff --git a/autocomplete-rs/src/types.rs b/autocomplete-rs/src/types.rs
new file mode 100644
index 0000000..cbd9316
--- /dev/null
+++ b/autocomplete-rs/src/types.rs
@@ -0,0 +1,104 @@
+/// Type alias for document and term IDs
+pub type IdType = u32;
+
+/// Type alias for completion type (vector of term IDs)
+pub type CompletionType = Vec<IdType>;
+
+/// Type alias for score type
+pub type ScoreType = f32;
+
+/// Represents a range of values
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct ValueRange {
+    pub begin: u64,
+    pub end: u64,
+}
+
+impl ValueRange {
+    /// Check if the range is invalid
+    pub fn is_invalid(&self) -> bool {
+        self.begin == u64::MAX || self.end == u64::MAX || self.begin > self.end
+    }
+
+    /// Check if the range is valid
+    pub fn is_valid(&self) -> bool {
+        !self.is_invalid()
+    }
+
+    /// Check if a value is contained in the range
+    pub fn contains(&self, val: u64) -> bool {
+        val >= self.begin && val <= self.end
+    }
+}
+
+/// Represents a scored range
+#[derive(Debug, Clone)]
+pub struct ScoredRange {
+    pub range: ValueRange,
+    pub min_pos: u32,
+    pub min_val: IdType,
+}
+
+impl ScoredRange {
+    /// Compare two scored ranges
+    pub fn greater(l: &ScoredRange, r: &ScoredRange) -> bool {
+        l.min_val > r.min_val
+    }
+}
+
+/// Represents a byte range
+#[derive(Debug, Clone, Copy)]
+pub struct ByteRange {
+    pub start: usize,
+    pub end: usize,
+}
+
+impl ByteRange {
+    pub fn new(start: usize, end: usize) -> Self {
+        Self { start, end }
+    }
+
+    pub fn len(&self) -> usize {
+        self.end - self.start
+    }
+}
+
+/// Represents a range of 32-bit integers
+#[derive(Debug, Clone, Copy)]
+pub struct Uint32Range {
+    pub begin: *const u32,
+    pub end: *const u32,
+}
+
+/// Global constants
+pub mod global {
+    use super::IdType;
+
+    /// Invalid term ID
+    pub const INVALID_TERM_ID: IdType = IdType::MAX;
+
+    /// Terminator value
+    pub const TERMINATOR: IdType = 0;
+
+    /// Not found value
+    pub const NOT_FOUND: u64 = u64::MAX;
+
+    /// Linear scan threshold
+    pub const LINEAR_SCAN_THRESHOLD: u64 = 8;
+}
+
+/// Convert a string to a byte range
+pub fn string_to_byte_range(s: &str) -> ByteRange {
+    ByteRange {
+        start: 0,
+        end: s.len(),
+    }
+}
+
+/// Convert a completion to a uint32 range
+pub fn completion_to_uint32_range(c: &CompletionType) -> Uint32Range {
+    Uint32Range {
+        begin: c.as_ptr(),
+        end: unsafe { c.as_ptr().add(c.len()) },
+    }
+}
\ No newline at end of file
diff --git a/autocomplete-rs/tests/constants_tests.rs b/autocomplete-rs/tests/constants_tests.rs
new file mode 100644
index 0000000..94123cc
--- /dev/null
+++ b/autocomplete-rs/tests/constants_tests.rs
@@ -0,0 +1,21 @@
+use autocomplete_rs::constants::*;
+
+#[test]
+fn test_constants() {
+    // Test MAX_K
+    assert!(MAX_K > 0, "MAX_K should be positive");
+    assert!(MAX_K <= 100, "MAX_K should be reasonably small");
+
+    // Test MAX_NUM_TERMS_PER_QUERY
+    assert!(MAX_NUM_TERMS_PER_QUERY > 0, "MAX_NUM_TERMS_PER_QUERY should be positive");
+    assert!(MAX_NUM_TERMS_PER_QUERY < 256, "MAX_NUM_TERMS_PER_QUERY must be < 256");
+
+    // Test MAX_NUM_CHARS_PER_QUERY
+    assert!(MAX_NUM_CHARS_PER_QUERY > 0, "MAX_NUM_CHARS_PER_QUERY should be positive");
+    assert!(MAX_NUM_CHARS_PER_QUERY >= MAX_K, "MAX_NUM_CHARS_PER_QUERY should be >= MAX_K");
+
+    // Test POOL_SIZE
+    assert!(POOL_SIZE > 0, "POOL_SIZE should be positive");
+    assert_eq!(POOL_SIZE, (MAX_K as usize) * (MAX_NUM_CHARS_PER_QUERY as usize),
+               "POOL_SIZE should be MAX_K * MAX_NUM_CHARS_PER_QUERY");
+}
\ No newline at end of file
diff --git a/autocomplete-rs/tests/dictionary_tests.rs
b/autocomplete-rs/tests/dictionary_tests.rs new file mode 100644 index 0000000..1aab6d8 --- /dev/null +++ b/autocomplete-rs/tests/dictionary_tests.rs @@ -0,0 +1,119 @@ +use autocomplete_rs::dictionary::Dictionary; +use autocomplete_rs::types::IdType; + +#[test] +fn test_dictionary_new() { + let dict = Dictionary::new(); + assert!(dict.is_empty()); + assert_eq!(dict.len(), 0); +} + +#[test] +fn test_dictionary_insert() { + let mut dict = Dictionary::new(); + + // Test first insertion + let id1 = dict.insert("hello".to_string()); + assert_eq!(id1, 0); + assert_eq!(dict.len(), 1); + + // Test duplicate insertion + let id2 = dict.insert("hello".to_string()); + assert_eq!(id2, id1); + assert_eq!(dict.len(), 1); + + // Test new insertion + let id3 = dict.insert("world".to_string()); + assert_eq!(id3, 1); + assert_eq!(dict.len(), 2); +} + +#[test] +fn test_dictionary_get() { + let mut dict = Dictionary::new(); + + // Insert test data + let id1 = dict.insert("hello".to_string()); + let id2 = dict.insert("world".to_string()); + + // Test valid gets + assert_eq!(dict.get(id1), Some("hello")); + assert_eq!(dict.get(id2), Some("world")); + + // Test invalid id + assert_eq!(dict.get(999), None); +} + +#[test] +fn test_dictionary_get_id() { + let mut dict = Dictionary::new(); + + // Insert test data + let id1 = dict.insert("hello".to_string()); + let id2 = dict.insert("world".to_string()); + + // Test valid gets + assert_eq!(dict.get_id("hello"), Some(id1)); + assert_eq!(dict.get_id("world"), Some(id2)); + + // Test non-existent string + assert_eq!(dict.get_id("nonexistent"), None); +} + +#[test] +fn test_dictionary_len_and_empty() { + let mut dict = Dictionary::new(); + + // Test empty state + assert!(dict.is_empty()); + assert_eq!(dict.len(), 0); + + // Test after insertions + dict.insert("hello".to_string()); + assert!(!dict.is_empty()); + assert_eq!(dict.len(), 1); + + dict.insert("world".to_string()); + assert!(!dict.is_empty()); + assert_eq!(dict.len(), 2); + + // Test duplicate insertion doesn't change length + dict.insert("hello".to_string()); + assert_eq!(dict.len(), 2); +} + +#[test] +fn test_dictionary_id_sequence() { + let mut dict = Dictionary::new(); + + // Test that IDs are assigned sequentially + let id1 = dict.insert("first".to_string()); + let id2 = dict.insert("second".to_string()); + let id3 = dict.insert("third".to_string()); + + assert_eq!(id1, 0); + assert_eq!(id2, 1); + assert_eq!(id3, 2); +} + +#[test] +fn test_dictionary_large_insertions() { + let mut dict = Dictionary::new(); + let num_insertions = 1000; + + // Insert many strings + for i in 0..num_insertions { + let s = format!("string_{}", i); + let id = dict.insert(s); + assert_eq!(id, i as IdType); + } + + assert_eq!(dict.len(), num_insertions); + + // Verify all strings can be retrieved + for i in 0..num_insertions { + let s = format!("string_{}", i); + assert_eq!(dict.get(i as IdType), Some(s.as_str())); + assert_eq!(dict.get_id(&s), Some(i as IdType)); + } +} \ No newline at end of file diff --git a/autocomplete-rs/tests/parameters_tests.rs b/autocomplete-rs/tests/parameters_tests.rs new file mode 100644 index 0000000..2bd6762 --- /dev/null +++ b/autocomplete-rs/tests/parameters_tests.rs @@ -0,0 +1,98 @@ +use std::fs::File; +use std::io::Write; +use std::path::Path; +use tempfile::NamedTempFile; +use autocomplete_rs::parameters::Parameters; +use autocomplete_rs::constants::{MAX_NUM_CHARS_PER_QUERY, MAX_NUM_TERMS_PER_QUERY}; + +fn create_test_stats_file() -> NamedTempFile { + let mut file = NamedTempFile::new().unwrap(); 
+ writeln!(file, "1000").unwrap(); // num_terms + writeln!(file, "50").unwrap(); // max_string_length + writeln!(file, "500").unwrap(); // num_completions + writeln!(file, "1000").unwrap(); // universe + writeln!(file, "3").unwrap(); // num_levels + writeln!(file, "100").unwrap(); // nodes_per_level[0] + writeln!(file, "200").unwrap(); // nodes_per_level[1] + writeln!(file, "300").unwrap(); // nodes_per_level[2] + file +} + +#[test] +fn test_parameters_load_valid() { + let test_file = create_test_stats_file(); + let mut params = Parameters::new(); + let path = test_file.path().to_str().unwrap().to_string(); + println!("Test file path: {}", path); + params.collection_basename = path; + + match params.load() { + Ok(_) => println!("Load succeeded"), + Err(e) => println!("Load failed: {}", e), + } + + assert!(params.load().is_ok()); + assert_eq!(params.num_terms, 1000); + assert_eq!(params.max_string_length, 50); + assert_eq!(params.num_completions, 500); + assert_eq!(params.universe, 1000); + assert_eq!(params.num_levels, 3); + assert_eq!(params.nodes_per_level, vec![100, 200, 300]); +} + +#[test] +fn test_parameters_load_invalid_file() { + let mut params = Parameters::new(); + params.collection_basename = "nonexistent_file".to_string(); + assert!(params.load().is_err()); +} + +#[test] +fn test_parameters_load_invalid_data() { + let mut file = NamedTempFile::new().unwrap(); + writeln!(file, "0").unwrap(); // invalid num_terms + writeln!(file, "50").unwrap(); + writeln!(file, "500").unwrap(); + writeln!(file, "1000").unwrap(); + writeln!(file, "3").unwrap(); + writeln!(file, "100").unwrap(); + writeln!(file, "200").unwrap(); + writeln!(file, "300").unwrap(); + + let mut params = Parameters::new(); + params.collection_basename = file.path().to_str().unwrap().to_string(); + assert!(params.load().is_err()); +} + +#[test] +fn test_parameters_load_invalid_constants() { + let mut file = NamedTempFile::new().unwrap(); + writeln!(file, "1000").unwrap(); + writeln!(file, "{}", MAX_NUM_CHARS_PER_QUERY + 1).unwrap(); // exceeds MAX_NUM_CHARS_PER_QUERY + writeln!(file, "500").unwrap(); + writeln!(file, "1000").unwrap(); + writeln!(file, "3").unwrap(); + writeln!(file, "100").unwrap(); + writeln!(file, "200").unwrap(); + writeln!(file, "300").unwrap(); + + let mut params = Parameters::new(); + params.collection_basename = file.path().to_str().unwrap().to_string(); + assert!(params.load().is_err()); +} + +#[test] +fn test_parameters_load_truncated() { + let mut file = NamedTempFile::new().unwrap(); + writeln!(file, "1000").unwrap(); + writeln!(file, "50").unwrap(); + writeln!(file, "500").unwrap(); + writeln!(file, "1000").unwrap(); + writeln!(file, "3").unwrap(); + writeln!(file, "100").unwrap(); + // Missing nodes_per_level entries + + let mut params = Parameters::new(); + params.collection_basename = file.path().to_str().unwrap().to_string(); + assert!(params.load().is_err()); +} \ No newline at end of file diff --git a/autocomplete-rs/tests/probe_tests.rs b/autocomplete-rs/tests/probe_tests.rs new file mode 100644 index 0000000..7e869e1 --- /dev/null +++ b/autocomplete-rs/tests/probe_tests.rs @@ -0,0 +1,79 @@ +use std::thread; +use std::time::Duration; +use autocomplete_rs::probe::{Probe, NopProbe, TimerProbe}; + +#[test] +fn test_nop_probe() { + let mut probe = NopProbe; + // These should not panic + probe.start(0); + probe.stop(0); +} + +#[test] +fn test_timer_probe_single() { + let mut probe = TimerProbe::new(1); + + probe.start(0); + thread::sleep(Duration::from_millis(100)); + probe.stop(0); + + 
let duration = probe.get_duration(0);
+    assert!(duration >= Duration::from_millis(100));
+}
+
+#[test]
+fn test_timer_probe_multiple() {
+    let mut probe = TimerProbe::new(3);
+
+    // Timer 0
+    probe.start(0);
+    thread::sleep(Duration::from_millis(100));
+    probe.stop(0);
+
+    // Timer 1
+    probe.start(1);
+    thread::sleep(Duration::from_millis(200));
+    probe.stop(1);
+
+    // Timer 2
+    probe.start(2);
+    thread::sleep(Duration::from_millis(300));
+    probe.stop(2);
+
+    assert!(probe.get_duration(0) >= Duration::from_millis(100));
+    assert!(probe.get_duration(1) >= Duration::from_millis(200));
+    assert!(probe.get_duration(2) >= Duration::from_millis(300));
+}
+
+#[test]
+fn test_timer_probe_accumulation() {
+    let mut probe = TimerProbe::new(1);
+
+    // First interval
+    probe.start(0);
+    thread::sleep(Duration::from_millis(100));
+    probe.stop(0);
+
+    // Second interval
+    probe.start(0);
+    thread::sleep(Duration::from_millis(100));
+    probe.stop(0);
+
+    let duration = probe.get_duration(0);
+    assert!(duration >= Duration::from_millis(200));
+}
+
+#[test]
+#[should_panic(expected = "Timer ID out of bounds")]
+fn test_timer_probe_invalid_id() {
+    let mut probe = TimerProbe::new(1);
+    probe.start(1); // Should panic as we only have timer 0
+}
+
+#[test]
+#[should_panic(expected = "Timer ID out of bounds")]
+fn test_timer_probe_get_invalid_id() {
+    let probe = TimerProbe::new(1);
+    probe.get_duration(1); // Should panic as we only have timer 0
+}
\ No newline at end of file
diff --git a/benchmark/benchmark_common.hpp b/benchmark/benchmark_common.hpp
deleted file mode 100644
index 0fdae98..0000000
--- a/benchmark/benchmark_common.hpp
+++ /dev/null
@@ -1,33 +0,0 @@
-#pragma once
-
-namespace autocomplete {
-
-static const uint32_t runs = 5;
-
-size_t load_queries(std::vector<std::string>& queries, uint32_t max_num_queries,
-                    float percentage, std::istream& is = std::cin) {
-    assert(percentage >= 0.0 and percentage <= 1.0);
-    std::string line;
-    queries.reserve(max_num_queries);
-    for (uint32_t i = 0; i != max_num_queries; ++i) {
-        if (!std::getline(is, line)) break;
-
-        auto query = line.substr(line.find(' ') + 1, line.size());
-        int32_t size = query.size() - 1;
-        while (size >= 0 and query[size] != ' ') --size;
-        auto last_token = query.substr(size + 1, query.size() - size);
-        uint32_t num_chars =
-            last_token.size() - std::ceil(last_token.size() * percentage);
-        char first = last_token.front();
-        for (uint32_t i = 0; i != num_chars; ++i) last_token.pop_back();
-
-        // retain at least one char
-        if (last_token.empty()) last_token.push_back(first);
-        assert(last_token.size() > 0);
-
-        queries.push_back(query.substr(0, size + 1) + last_token);
-    }
-    return queries.size();
-}
-
-} // namespace autocomplete
\ No newline at end of file
diff --git a/benchmark/benchmark_conjunctive_topk.cpp b/benchmark/benchmark_conjunctive_topk.cpp
deleted file mode 100644
index 2a04c4c..0000000
--- a/benchmark/benchmark_conjunctive_topk.cpp
+++ /dev/null
@@ -1,113 +0,0 @@
-#include <iostream>
-
-#include "types.hpp"
-#include "statistics.hpp"
-#include "benchmark_common.hpp"
-
-using namespace autocomplete;
-
-template <typename Index>
-void benchmark_conjunctive_topk(char const* binary_filename, uint32_t k,
-                                uint32_t max_num_queries,
-                                essentials::json_lines& breakdowns,
-                                bool breakdown) {
-    Index autocomp;
-    essentials::logger("loading data structure from disk...");
-    essentials::load(autocomp, binary_filename);
-    essentials::logger("DONE");
-    autocomp.print_stats();
-
-    std::vector<std::string> queries;
-    essentials::logger("loading queries...");
-    uint32_t num_queries =
-        load_queries(queries, max_num_queries, 0.25, std::cin);
-    essentials::logger("loaded " + std::to_string(num_queries) + " queries");
-
-    auto ns_x_query = [&](double time) {
-        return uint64_t(time / (runs * num_queries) * 1000);
-    };
-
-    essentials::logger("benchmarking conjunctive_topk queries...");
-    uint64_t reported_strings = 0;
-
-    if (breakdown) {
-        std::vector<essentials::timer_type> timers(4);
-        for (uint32_t run = 0; run != runs; ++run) {
-            for (auto const& query : queries) {
-                auto it = autocomp.conjunctive_topk(query, k, timers);
-                reported_strings += it.size();
-            }
-        }
-        essentials::logger("DONE");
-        std::cout << reported_strings << std::endl;
-        breakdowns.add("num_queries", std::to_string(num_queries));
-        breakdowns.add("parsing_ns_per_query",
-                       std::to_string(ns_x_query(timers[0].elapsed())));
-        breakdowns.add("dictionary_search_ns_per_query",
-                       std::to_string(ns_x_query(timers[1].elapsed())));
-        breakdowns.add("conjunctive_search_ns_per_query",
-                       std::to_string(ns_x_query(timers[2].elapsed())));
-        breakdowns.add("reporting_ns_per_query",
-                       std::to_string(ns_x_query(timers[3].elapsed())));
-    } else {
-        essentials::timer_type timer;
-        timer.start();
-        for (uint32_t run = 0; run != runs; ++run) {
-            for (auto const& query : queries) {
-                auto it = autocomp.conjunctive_topk(query, k);
-                reported_strings += it.size();
-            }
-        }
-        timer.stop();
-        essentials::logger("DONE");
-        std::cout << reported_strings << std::endl;
-        breakdowns.add("num_queries", std::to_string(num_queries));
-        breakdowns.add("ns_per_query",
-                       std::to_string(ns_x_query(timer.elapsed())));
-    }
-}
-
-int main(int argc, char** argv) {
-    int mandatory = 5;
-    if (argc < mandatory + 1) {
-        std::cout << argv[0]
-                  << " <type> <k> <binary_filename> <num_terms_per_query> <max_num_queries>"
-                     " --breakdown < queries"
-                  << std::endl;
-        return 1;
-    }
-
-    std::string type(argv[1]);
-    uint32_t k = std::atoi(argv[2]);
-    char const* binary_filename = argv[3];
-    std::string num_terms_per_query(argv[4]);
-    uint32_t max_num_queries = std::atoi(argv[5]);
-
-    bool breakdown = false;
-    for (int i = mandatory; i != argc; ++i) {
-        if (std::string(argv[i]) == "--breakdown") {
-            breakdown = true;
-        }
-    }
-
-    essentials::json_lines breakdowns;
-    breakdowns.new_line();
-    breakdowns.add("num_terms_per_query", num_terms_per_query);
-
-    if (type == "type1") {
-        benchmark_conjunctive_topk<autocomplete_type1>(
-            binary_filename, k, max_num_queries, breakdowns, breakdown);
-    } else if (type == "type2") {
-        benchmark_conjunctive_topk<autocomplete_type2>(
-            binary_filename, k, max_num_queries, breakdowns, breakdown);
-    } else if (type == "type3") {
-        benchmark_conjunctive_topk<autocomplete_type3>(
-            binary_filename, k, max_num_queries, breakdowns, breakdown);
-    } else {
-        std::cout << "error: unknown type '" << type << "'" << std::endl;
-        return 1;
-    }
-
-    breakdowns.print();
-    return 0;
-}
\ No newline at end of file
diff --git a/benchmark/benchmark_prefix_topk.cpp b/benchmark/benchmark_prefix_topk.cpp
deleted file mode 100644
index 2149e03..0000000
--- a/benchmark/benchmark_prefix_topk.cpp
+++ /dev/null
@@ -1,109 +0,0 @@
-#include <iostream>
-
-#include "types.hpp"
-#include "statistics.hpp"
-#include "benchmark_common.hpp"
-
-using namespace autocomplete;
-
-template <typename Index>
-void benchmark_prefix_topk(char const* binary_filename, uint32_t k,
-                           uint32_t max_num_queries,
-                           essentials::json_lines& breakdowns, bool breakdown) {
-    Index autocomp;
-    essentials::logger("loading data structure from disk...");
-    essentials::load(autocomp, binary_filename);
-    essentials::logger("DONE");
-    autocomp.print_stats();
-
-    std::vector<std::string> queries;
-    essentials::logger("loading queries...");
-    uint32_t num_queries =
load_queries(queries, max_num_queries, 0.25, std::cin); - essentials::logger("loaded " + std::to_string(num_queries) + " queries"); - - auto ns_x_query = [&](double time) { - return uint64_t(time / (runs * num_queries) * 1000); - }; - - essentials::logger("benchmarking prefix_topk queries..."); - uint64_t reported_strings = 0; - - if (breakdown) { - std::vector timers(4); - for (uint32_t run = 0; run != runs; ++run) { - for (auto const& query : queries) { - auto it = autocomp.prefix_topk(query, k, timers); - reported_strings += it.size(); - } - } - essentials::logger("DONE"); - std::cout << reported_strings << std::endl; - breakdowns.add("num_queries", std::to_string(num_queries)); - breakdowns.add("parsing_ns_per_query", - std::to_string(ns_x_query(timers[0].elapsed()))); - breakdowns.add("completions_search_ns_per_query", - std::to_string(ns_x_query(timers[1].elapsed()))); - breakdowns.add("topk_rmq_ns_per_query", - std::to_string(ns_x_query(timers[2].elapsed()))); - breakdowns.add("reporting_ns_per_query", - std::to_string(ns_x_query(timers[3].elapsed()))); - } else { - essentials::timer_type timer; - timer.start(); - for (uint32_t run = 0; run != runs; ++run) { - for (auto const& query : queries) { - auto it = autocomp.prefix_topk(query, k); - reported_strings += it.size(); - } - } - timer.stop(); - essentials::logger("DONE"); - std::cout << reported_strings << std::endl; - breakdowns.add("num_queries", std::to_string(num_queries)); - breakdowns.add("ns_per_query", - std::to_string(ns_x_query(timer.elapsed()))); - } -} - -int main(int argc, char** argv) { - int mandatory = 5; - if (argc < mandatory + 1) { - std::cout << argv[0] - << " " - " --breakdown < queries" - << std::endl; - return 1; - } - - std::string type(argv[1]); - uint32_t k = std::atoi(argv[2]); - char const* binary_filename = argv[3]; - std::string num_terms_per_query(argv[4]); - uint32_t max_num_queries = std::atoi(argv[5]); - - bool breakdown = false; - for (int i = mandatory + 1; i != argc; ++i) { - if (std::string(argv[i]) == "--breakdown") { - breakdown = true; - } - } - - essentials::json_lines breakdowns; - breakdowns.new_line(); - breakdowns.add("num_terms_per_query", num_terms_per_query); - - if (type == "type1") { - benchmark_prefix_topk( - binary_filename, k, max_num_queries, breakdowns, breakdown); - } else if (type == "type2") { - benchmark_prefix_topk( - binary_filename, k, max_num_queries, breakdowns, breakdown); - } else { - std::cout << "error: unknown type '" << type << "'" << std::endl; - return 1; - } - - breakdowns.print(); - return 0; -} \ No newline at end of file diff --git a/benchmark/benchmark_topk.cpp b/benchmark/benchmark_topk.cpp deleted file mode 100644 index a294afe..0000000 --- a/benchmark/benchmark_topk.cpp +++ /dev/null @@ -1,115 +0,0 @@ -#include - -#include "types.hpp" -#include "statistics.hpp" -#include "benchmark_common.hpp" - -using namespace autocomplete; - -template -void benchmark_topk(char const* binary_filename, uint32_t k, - uint32_t max_num_queries, float keep, - essentials::json_lines& breakdowns, bool breakdown) { - Index index; - essentials::load(index, binary_filename); - - std::vector queries; - uint32_t num_queries = - load_queries(queries, max_num_queries, keep, std::cin); - - uint64_t reported_strings = 0; - auto musec_per_query = [&](double time) { - return time / (runs * num_queries); - }; - - breakdowns.add("num_queries", std::to_string(num_queries)); - - if (breakdown) { - std::vector timers(4); - for (uint32_t run = 0; run != runs; ++run) { - for (auto const& 
query : queries) { - auto it = index.topk(query, k, timers); - reported_strings += it.size(); - } - } - - std::cout << reported_strings << std::endl; - - breakdowns.add("parsing_musec_per_query", - std::to_string(musec_per_query(timers[0].elapsed()))); - breakdowns.add("prefix_search_musec_per_query", - std::to_string(musec_per_query(timers[1].elapsed()))); - breakdowns.add("conjunctive_search_musec_per_query", - std::to_string(musec_per_query(timers[2].elapsed()))); - breakdowns.add("reporting_musec_per_query", - std::to_string(musec_per_query(timers[3].elapsed()))); - - } else { - essentials::timer_type timer; - timer.start(); - for (uint32_t run = 0; run != runs; ++run) { - for (auto const& query : queries) { - auto it = index.topk(query, k); - reported_strings += it.size(); - } - } - timer.stop(); - - std::cout << reported_strings << std::endl; - - breakdowns.add("musec_per_query", - std::to_string(musec_per_query(timer.elapsed()))); - } -} - -int main(int argc, char** argv) { - int mandatory = 6; - if (argc < mandatory + 1) { - std::cout << argv[0] - << " " - " [--breakdown] < queries" - << std::endl; - std::cout << " is a float in [0,1] and specifies how much " - "we keep of the last token in a query " - << std::endl; - return 1; - } - - std::string type(argv[1]); - uint32_t k = std::atoi(argv[2]); - char const* binary_filename = argv[3]; - std::string num_terms_per_query(argv[4]); - uint32_t max_num_queries = std::atoi(argv[5]); - float keep = std::atof(argv[6]); - - bool breakdown = false; - for (int i = mandatory + 1; i != argc; ++i) { - if (std::string(argv[i]) == "--breakdown") { - breakdown = true; - } - } - - essentials::json_lines breakdowns; - breakdowns.new_line(); - breakdowns.add("num_terms_per_query", num_terms_per_query); - breakdowns.add("percentage", std::to_string(keep)); - - if (type == "ef_type1") { - benchmark_topk( - binary_filename, k, max_num_queries, keep, breakdowns, breakdown); - } else if (type == "ef_type2") { - benchmark_topk( - binary_filename, k, max_num_queries, keep, breakdowns, breakdown); - } else if (type == "ef_type3") { - benchmark_topk( - binary_filename, k, max_num_queries, keep, breakdowns, breakdown); - } else if (type == "ef_type4") { - benchmark_topk( - binary_filename, k, max_num_queries, keep, breakdowns, breakdown); - } else { - return 1; - } - - breakdowns.print(); - return 0; -} \ No newline at end of file diff --git a/doc/activity_diagram.md b/doc/activity_diagram.md new file mode 100644 index 0000000..993101b --- /dev/null +++ b/doc/activity_diagram.md @@ -0,0 +1,157 @@ +# Activity Diagrams + +This document provides activity diagrams for the main workflows in the autocomplete system. 
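As a compact companion to the diagrams that follow, here is a hypothetical Rust sketch of the query path shown in the "Autocomplete Query Processing" diagram: parse, prefix lookup, score, sort, truncate to top-k. Every name below is an illustrative placeholder, not the codebase's API.

```rust
/// Placeholder for the real completion trie; `completions` returns
/// scored candidates for a prefix, or None when the prefix is absent.
struct Trie;

impl Trie {
    fn completions(&self, _prefix: &str) -> Option<Vec<(String, f32)>> {
        None // stub: the real trie walks its nodes here
    }
}

fn autocomplete(trie: &Trie, query: &str, k: usize) -> Vec<(String, f32)> {
    let prefix = query.trim(); // "Parse Query Terms"
    let Some(mut scored) = trie.completions(prefix) else {
        return Vec::new(); // "No Prefix" branch: return empty results
    };
    scored.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap()); // "Sort by Score"
    scored.truncate(k); // "Return Top-K Results"
    scored
}
```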
+ +## System Initialization and Index Building + +```mermaid +graph TD + Start([Start]) --> LoadParams[Load Parameters] + LoadParams --> InitComponents[Initialize Components] + InitComponents --> BuildTrie[Build Completion Trie] + BuildTrie --> BuildDict[Build Front-Coded Dictionary] + BuildDict --> BuildIndex[Build Inverted Index] + BuildIndex --> BuildForwardIndex[Build Forward Index] + BuildForwardIndex --> End([End]) + + style Start fill:#f9f,stroke:#333,stroke-width:2px + style End fill:#f9f,stroke:#333,stroke-width:2px +``` + +## Autocomplete Query Processing + +```mermaid +graph TD + Start([Start]) --> InputQuery[Input Query] + InputQuery --> ParseQuery[Parse Query Terms] + ParseQuery --> CheckPrefix[Check Prefix in Trie] + + CheckPrefix -->|Prefix Found| GetCompletions[Get Completions] + CheckPrefix -->|No Prefix| ReturnEmpty[Return Empty Results] + + GetCompletions --> ScoreCompletions[Score Completions] + ScoreCompletions --> SortResults[Sort by Score] + SortResults --> ReturnResults[Return Top-K Results] + + ReturnEmpty --> End([End]) + ReturnResults --> End + + style Start fill:#f9f,stroke:#333,stroke-width:2px + style End fill:#f9f,stroke:#333,stroke-width:2px +``` + +## Search Operation Flow + +```mermaid +graph TD + Start([Start]) --> InputTerms[Input Search Terms] + InputTerms --> ParseTerms[Parse Search Terms] + ParseTerms --> LookupTerms[Lookup Terms in Dictionary] + + LookupTerms -->|All Terms Found| GetPostings[Get Posting Lists] + LookupTerms -->|Terms Not Found| ReturnEmpty[Return Empty Results] + + GetPostings --> IntersectLists[Intersect Posting Lists] + IntersectLists --> ScoreDocs[Score Documents] + ScoreDocs --> SortResults[Sort by Score] + SortResults --> ReturnResults[Return Top-K Results] + + ReturnEmpty --> End([End]) + ReturnResults --> End + + style Start fill:#f9f,stroke:#333,stroke-width:2px +``` + +## String Pool Management + +```mermaid +graph TD + Start([Start]) --> CheckCapacity[Check Pool Capacity] + CheckCapacity -->|Full| RemoveLowest[Remove Lowest Score] + CheckCapacity -->|Space Available| AddString[Add New String] + + RemoveLowest --> AddString + AddString --> UpdateScores[Update Scores] + UpdateScores --> SortPool[Sort Pool by Score] + SortPool --> End([End]) + + style Start fill:#f9f,stroke:#333,stroke-width:2px + style End fill:#f9f,stroke:#333,stroke-width:2px +``` + +## Blocked Inverted Index Operations + +```mermaid +graph TD + Start([Start]) --> InputDoc[Input Document] + InputDoc --> ExtractTerms[Extract Terms] + ExtractTerms --> CheckBlocks[Check Existing Blocks] + + CheckBlocks -->|Block Found| UpdateBlock[Update Block] + CheckBlocks -->|New Block| CreateBlock[Create New Block] + + UpdateBlock --> MergeCheck[Check Merge Condition] + CreateBlock --> MergeCheck + + MergeCheck -->|Merge Needed| MergeBlocks[Merge Blocks] + MergeCheck -->|No Merge| UpdateIndex[Update Index] + + MergeBlocks --> UpdateIndex + UpdateIndex --> End([End]) + + style Start fill:#f9f,stroke:#333,stroke-width:2px + style End fill:#f9f,stroke:#333,stroke-width:2px +``` + +## Performance Measurement Flow + +```mermaid +graph TD + Start([Start]) --> StartTimer[Start Timer] + StartTimer --> Operation[Perform Operation] + Operation --> StopTimer[Stop Timer] + StopTimer --> RecordMetrics[Record Metrics] + RecordMetrics --> AnalyzePerformance[Analyze Performance] + AnalyzePerformance --> End([End]) + + style Start fill:#f9f,stroke:#333,stroke-width:2px + style End fill:#f9f,stroke:#333,stroke-width:2px +``` + +## Key Operations Description + +### System Initialization +1. 
Load configuration parameters +2. Initialize core components +3. Build data structures +4. Set up indexes + +### Query Processing +1. Parse and validate input +2. Check prefix in trie +3. Retrieve and score completions +4. Sort and return results + +### Search Operations +1. Process search terms +2. Lookup in dictionary +3. Retrieve and intersect posting lists +4. Score and rank results + +### String Pool Management +1. Maintain fixed-size pool +2. Handle insertions and removals +3. Update and sort scores +4. Manage memory efficiently + +### Blocked Index Operations +1. Process document updates +2. Manage block structure +3. Handle block merges +4. Maintain index consistency + +### Performance Measurement +1. Track operation timing +2. Record performance metrics +3. Analyze system behavior +4. Optimize based on results \ No newline at end of file diff --git a/doc/class_diagram.md b/doc/class_diagram.md new file mode 100644 index 0000000..4a5f4db --- /dev/null +++ b/doc/class_diagram.md @@ -0,0 +1,333 @@ +# C++ Class Diagram + +This document provides a comprehensive view of all classes in the C++ implementation and their relationships. + +## Main Class Diagram + +```mermaid +classDiagram + class Parameters { + +uint32_t num_terms + +uint32_t max_string_length + +uint32_t num_completions + +uint32_t universe + +uint32_t num_levels + +vector~uint32_t~ nodes_per_level + +string collection_basename + +load() + } + + class Probe { + <> + +start(id: uint64_t) + +stop(id: uint64_t) + } + + class NopProbe { + +start(id: uint64_t) + +stop(id: uint64_t) + } + + class TimerProbe { + -vector~Timer~ timers + +start(id: uint64_t) + +stop(id: uint64_t) + +get_duration(id: uint64_t) + } + + class Timer { + -Instant start_time + -Duration total_duration + +start() + +stop() + +get_duration() + } + + class ScoredStringPool { + -vector~id_type~ m_scores + -vector~size_t~ m_offsets + -vector~uint8_t~ m_data + +init() + +resize(size_t, uint32_t) + +clear() + +size() + +bytes() + +data() + +push_back_offset(size_t) + +scores() + +const_scores() + } + + class ScoredByteRange { + +byte_range string + +id_type score + } + + class TrieNode { + -unordered_map~char, TrieNode*~ children + -bool is_terminal + -vector~uint32_t~ completion_ids + +add_child(char) + +get_child(char) + +is_terminal() + } + + class CompletionTrie { + -TrieNode* root + -size_t num_nodes + -size_t num_completions + +insert(string) + +complete(string) + +remove(string) + +clear() + } + + class FCDictionary { + -vector~char~ data + -vector~uint32_t~ offsets + -size_t num_strings + -size_t total_size + +build(vector~string~) + +lookup(uint32_t) + +compress() + +decompress(uint32_t) + } + + class IntegerFCDictionary { + -vector~uint32_t~ m_headers + -vector~uint8_t~ m_buckets + -size_t m_size + +build(vector~string~) + +lookup(uint32_t) + +extract(id_type, completion_type) + } + + class Block { + -vector~uint32_t~ doc_ids + -uint32_t min_doc_id + -uint32_t max_doc_id + +add_doc(uint32_t) + +get_docs() + +get_range() + } + + class InvertedIndex { + -vector~Block~ blocks + -unordered_map~string, vector~uint32_t~~ term_to_blocks + -size_t block_size + +add_document(uint32_t, vector~string~) + +search(vector~string~) + +merge_blocks() + +clear() + } + + class CompactVector { + -vector~uint64_t~ m_bits + -uint8_t m_width + -uint64_t m_mask + +build(vector~uint64_t~) + +access(uint64_t) + +size() + } + + class BitVector { + -vector~uint64_t~ m_bits + -size_t m_size + +build(bit_vector_builder*) + +size() + +bytes() + +operator[](uint64_t) + 
+get_bits(uint64_t, uint64_t) + } + + class MinHeap { + -vector~T~ m_q + -Comparator m_comparator + +reserve(uint64_t) + +top() + +push(T) + +pop() + +clear() + +empty() + +size() + } + + class Autocomplete { + -Parameters params + -ScoredStringPool string_pool + -CompletionTrie trie + -FCDictionary dictionary + -InvertedIndex index + +build_index(string) + +complete(string) + +search(vector~string~) + } + + class Autocomplete2 { + -Parameters params + -ScoredStringPool string_pool + -CompletionTrie trie + -FCDictionary dictionary + -InvertedIndex index + -CompactVector docid_to_lexid + +build_index(string) + +complete(string) + +search(vector~string~) + } + + class Autocomplete3 { + -Parameters params + -ScoredStringPool string_pool + -CompletionTrie trie + -FCDictionary dictionary + -InvertedIndex index + -MinHeap min_priority_queue + +build_index(string) + +complete(string) + +search(vector~string~) + } + + class Autocomplete4 { + -Parameters params + -ScoredStringPool string_pool + -CompletionTrie trie + -FCDictionary dictionary + -BlockedInvertedIndex index + +build_index(string) + +complete(string) + +search(vector~string~) + } + + %% Relationships + Probe <|-- NopProbe + Probe <|-- TimerProbe + TimerProbe *-- Timer + Autocomplete *-- Parameters + Autocomplete *-- ScoredStringPool + Autocomplete *-- CompletionTrie + Autocomplete *-- FCDictionary + Autocomplete *-- InvertedIndex + CompletionTrie *-- TrieNode + InvertedIndex *-- Block + ScoredStringPool *-- ScoredByteRange + Autocomplete2 --|> Autocomplete + Autocomplete3 --|> Autocomplete + Autocomplete4 --|> Autocomplete + Autocomplete3 *-- MinHeap + Autocomplete2 *-- CompactVector + Autocomplete4 *-- BlockedInvertedIndex +``` + +## Component Dependencies + +```mermaid +graph TD + subgraph Core + Parameters + Probe + Timer + end + + subgraph Data Structures + ScoredStringPool + CompletionTrie + FCDictionary + IntegerFCDictionary + InvertedIndex + BlockedInvertedIndex + CompactVector + BitVector + MinHeap + end + + subgraph Implementation + Autocomplete + Autocomplete2 + Autocomplete3 + Autocomplete4 + end + + %% Dependencies + Parameters --> ScoredStringPool + Parameters --> CompletionTrie + Parameters --> FCDictionary + Parameters --> InvertedIndex + Parameters --> IntegerFCDictionary + + ScoredStringPool --> Autocomplete + CompletionTrie --> Autocomplete + FCDictionary --> Autocomplete + InvertedIndex --> Autocomplete + IntegerFCDictionary --> Autocomplete2 + CompactVector --> Autocomplete2 + MinHeap --> Autocomplete3 + BlockedInvertedIndex --> Autocomplete4 + + style Core fill:#f9f,stroke:#333,stroke-width:2px + style Data Structures fill:#9f9,stroke:#333,stroke-width:2px + style Implementation fill:#99f,stroke:#333,stroke-width:2px +``` + +## Memory Layout + +```mermaid +graph TD + subgraph Memory Organization + direction TB + Stack[Stack Memory] --> Heap[Heap Memory] + Heap --> Data[Data Structures] + Data --> Strings[String Pool] + Data --> Trie[Trie Nodes] + Data --> Dict[Dictionary] + Data --> Index[Inverted Index] + Data --> Compact[Compact Vectors] + Data --> BitVec[Bit Vectors] + end + + style Memory Organization fill:#f9f,stroke:#333,stroke-width:2px +``` + +## Key Features and Methods + +### Core Components +- **Parameters**: Configuration management +- **Probe**: Performance measurement interface +- **Timer**: Time tracking implementation + +### Data Structures +- **ScoredStringPool**: String and score management +- **CompletionTrie**: Prefix-based completion +- **FCDictionary**: String compression +- 
**IntegerFCDictionary**: Integer-based dictionary +- **InvertedIndex**: Term-based search +- **BlockedInvertedIndex**: Blocked term-based search +- **CompactVector**: Space-efficient vector +- **BitVector**: Bit-level operations +- **MinHeap**: Priority queue implementation + +### Main Implementation +- **Autocomplete**: Base implementation +- **Autocomplete2**: Integer-based optimization +- **Autocomplete3**: Min-heap based optimization +- **Autocomplete4**: Blocked index optimization + +## Usage Example + +```cpp +// Initialize components +Parameters params; +params.load("config.stats"); + +ScoredStringPool pool(POOL_SIZE); +CompletionTrie trie; +FCDictionary dict; +InvertedIndex index; + +// Build autocomplete system +Autocomplete ac(params, pool, trie, dict, index); +ac.build_index("data.txt"); + +// Use the system +auto completions = ac.complete("hello"); +auto results = ac.search({"hello", "world"}); +``` \ No newline at end of file diff --git a/doc/component_diagram.md b/doc/component_diagram.md new file mode 100644 index 0000000..5c9fd83 --- /dev/null +++ b/doc/component_diagram.md @@ -0,0 +1,45 @@ +# Component Relationships + +```mermaid +graph TD + subgraph Core + Constants[Constants] + Parameters[Parameters] + Probe[Performance Probe] + end + + subgraph Data Structures + StringPool[String Pool] + Trie[Completion Trie] + Dictionary[Front-Coded Dictionary] + Index[Blocked Inverted Index] + end + + subgraph Pipeline + Input[Input Processing] + Build[Index Building] + Query[Query Processing] + end + + %% Core Dependencies + Constants --> Parameters + Parameters --> StringPool + Parameters --> Trie + Parameters --> Dictionary + Parameters --> Index + Probe --> Query + + %% Data Structure Dependencies + Dictionary --> Trie + Trie --> Index + StringPool --> Dictionary + StringPool --> Trie + StringPool --> Index + + %% Pipeline Dependencies + Input --> Build + Build --> Query + Query --> Trie + Query --> Index + Query --> Dictionary +``` \ No newline at end of file diff --git a/doc/cpp_structure.md b/doc/cpp_structure.md new file mode 100644 index 0000000..bfa42d0 --- /dev/null +++ b/doc/cpp_structure.md @@ -0,0 +1,153 @@ +# C++ Code Structure Documentation + +This document outlines the structure of the original C++ implementation that is being ported to Rust. + +## Core Components + +### 1. Constants and Configuration +- **File**: `constants.hpp` +- **Purpose**: Defines system-wide constants and limits +- **Key Constants**: + - `MAX_K`: Maximum number of completions + - `MAX_NUM_TERMS_PER_QUERY`: Maximum terms per query + - `MAX_NUM_CHARS_PER_QUERY`: Maximum characters per query + - `POOL_SIZE`: Size of the string pool + +### 2. Parameters Management +- **File**: `parameters.hpp` +- **Purpose**: Manages system configuration parameters +- **Key Struct**: `parameters` + - `num_terms`: Total number of terms + - `max_string_length`: Maximum string length + - `num_completions`: Number of completions + - `universe`: Size of the universe + - `num_levels`: Number of levels in the index + - `nodes_per_level`: Vector of nodes per level + - `collection_basename`: Base name for collection files + +### 3. Performance Measurement +- **File**: `probe.hpp` +- **Purpose**: Performance measurement and timing +- **Key Structs**: + - `nop_probe`: No-operation probe + - `timer_probe`: Timer-based performance measurement + +### 4. 
String Pool Management +- **File**: `scored_string_pool.hpp` +- **Purpose**: Manages a pool of scored strings +- **Key Components**: + - String storage + - Score management + - Pool operations + +### 5. Completion Trie +- **File**: `completion_trie.hpp` +- **Purpose**: Implements the completion trie data structure +- **Key Features**: + - Prefix-based completion + - Node management + - Traversal operations + +### 6. Blocked Inverted Index +- **File**: `blocked_inverted_index.hpp` +- **Purpose**: Implements blocked inverted indexing +- **Key Components**: + - Block management + - Index operations + - Query processing + +### 7. Front-Coded Dictionary +- **File**: `fc_dictionary.hpp` +- **Purpose**: Implements front-coding for dictionary compression +- **Key Features**: + - String compression + - Dictionary operations + - Lookup functionality + +## Data Pipeline + +1. **Input Processing** + - Read input completions + - Sort lexicographically + - Generate statistics + +2. **Index Building** + - Build front-coded dictionary + - Construct completion trie + - Create blocked inverted index + +3. **Query Processing** + - Parse input query + - Traverse completion trie + - Search inverted index + - Return top-k completions + +## Key Methods and Operations + +### Dictionary Operations +```cpp +// Front-coded dictionary +void build_dictionary(); +void compress_strings(); +std::string lookup(uint32_t id); +``` + +### Trie Operations +```cpp +// Completion trie +void insert(const std::string& completion); +std::vector complete(const std::string& prefix); +``` + +### Index Operations +```cpp +// Blocked inverted index +void build_index(); +std::vector search(const std::vector& terms); +``` + +### Query Processing +```cpp +// Query handling +std::vector process_query(const std::string& query); +void rank_completions(std::vector& completions); +``` + +## Dependencies and Relationships + +1. **Core Dependencies** + - Constants → Parameters + - Parameters → All major components + - Probe → Performance measurement + +2. **Data Structure Dependencies** + - Front-coded Dictionary → Completion Trie + - Completion Trie → Blocked Inverted Index + - All components → String Pool + +3. **Pipeline Dependencies** + - Input Processing → Index Building + - Index Building → Query Processing + - Query Processing → All components + +## Porting Strategy + +1. **Phase 1: Core Components** + - Constants and configuration + - Parameters management + - Performance measurement + +2. **Phase 2: Data Structures** + - String pool + - Completion trie + - Front-coded dictionary + +3. **Phase 3: Index and Query** + - Blocked inverted index + - Query processing + - Pipeline integration + +4. **Phase 4: Testing and Optimization** + - Unit tests + - Integration tests + - Performance optimization \ No newline at end of file diff --git a/doc/data_structures.md b/doc/data_structures.md new file mode 100644 index 0000000..9da8761 --- /dev/null +++ b/doc/data_structures.md @@ -0,0 +1,253 @@ +# Data Structures Documentation + +This document details the key data structures used in the autocomplete system. + +## 1. Scored String Pool + +### Purpose +Manages a fixed-size pool of strings with associated scores, optimized for fast retrieval and updates. 
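As a reading aid before the C++ layout below, here is a minimal Rust sketch of the same idea: parallel string/score vectors with a fixed capacity, evicting the lowest score when full. All names are illustrative assumptions for this document, not the ported API.

```rust
/// Minimal sketch of a fixed-capacity scored string pool:
/// strings and scores live in parallel vectors, mirroring the
/// separate-storage layout described in this section.
pub struct ScoredStringPool {
    strings: Vec<String>,
    scores: Vec<f32>,
    capacity: usize, // assumed > 0
}

impl ScoredStringPool {
    pub fn new(capacity: usize) -> Self {
        Self {
            strings: Vec::with_capacity(capacity),
            scores: Vec::with_capacity(capacity),
            capacity,
        }
    }

    /// Insert a scored string; when the pool is full, replace the
    /// lowest-scored entry only if the new score beats it.
    pub fn insert(&mut self, s: String, score: f32) {
        if self.strings.len() < self.capacity {
            self.strings.push(s);
            self.scores.push(score);
            return;
        }
        let (min_idx, &min_score) = self
            .scores
            .iter()
            .enumerate()
            .min_by(|a, b| a.1.partial_cmp(b.1).unwrap())
            .expect("capacity > 0");
        if score > min_score {
            self.strings[min_idx] = s;
            self.scores[min_idx] = score;
        }
    }

    /// Up to k (string, score) pairs, best score first.
    pub fn top_k(&self, k: usize) -> Vec<(&str, f32)> {
        let mut order: Vec<usize> = (0..self.strings.len()).collect();
        order.sort_by(|&a, &b| self.scores[b].partial_cmp(&self.scores[a]).unwrap());
        order
            .into_iter()
            .take(k)
            .map(|i| (self.strings[i].as_str(), self.scores[i]))
            .collect()
    }
}
```

A bounded min-heap keyed on score would avoid the linear eviction scan; the sketch trades speed for clarity.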
+ +### Structure +```cpp +struct scored_string_pool { + std::vector strings; // String storage + std::vector scores; // Associated scores + size_t size; // Current pool size + size_t capacity; // Maximum capacity +}; +``` + +### Visualization +```mermaid +graph TD + subgraph String Pool + direction LR + S1[String 1] --> SC1[Score 0.8] + S2[String 2] --> SC2[Score 0.6] + S3[String 3] --> SC3[Score 0.9] + S4[String 4] --> SC4[Score 0.7] + end + style String Pool fill:#f9f,stroke:#333,stroke-width:2px +``` + +### Key Operations +- `insert(string, score)`: Add a new string with its score +- `get_score(index)`: Retrieve score for a string +- `get_string(index)`: Retrieve string by index +- `update_score(index, score)`: Update score for a string +- `clear()`: Reset the pool + +### Memory Management +- Fixed-size allocation to prevent reallocations +- Contiguous memory layout for cache efficiency +- Score and string data stored separately for better cache utilization + +## 2. Completion Trie + +### Purpose +Efficient prefix-based string completion using a trie data structure. + +### Structure +```cpp +struct trie_node { + std::unordered_map children; + bool is_terminal; + std::vector completion_ids; +}; + +struct completion_trie { + trie_node* root; + size_t num_nodes; + size_t num_completions; +}; +``` + +### Visualization +```mermaid +graph TD + Root((Root)) --> H((h)) + H --> HE((e)) + HE --> HEL((l)) + HEL --> HELL((l)) + HELL --> HELLO((o)) + HELLO --> HELLOW((w)) + HELLOW --> HELLOWO((o)) + HELLOWO --> HELLOWOR((r)) + HELLOWOR --> HELLOWORL((l)) + HELLOWORL --> HELLOWORLD((d)) + + style Root fill:#f9f,stroke:#333,stroke-width:2px + style HELLOWORLD fill:#9f9,stroke:#333,stroke-width:2px +``` + +### Key Operations +- `insert(completion)`: Add a new completion string +- `complete(prefix)`: Find all completions for a prefix +- `remove(completion)`: Remove a completion string +- `clear()`: Reset the trie + +### Optimizations +- Path compression for common prefixes +- Node sharing for similar completions +- Lazy deletion for better performance + +## 3. Front-Coded Dictionary + +### Purpose +Compressed string dictionary using front-coding technique. + +### Structure +```cpp +struct fc_dictionary { + std::vector data; // Compressed string data + std::vector offsets; // String offsets + size_t num_strings; // Number of strings + size_t total_size; // Total compressed size +}; +``` + +### Visualization +```mermaid +graph LR + subgraph Front-Coded Dictionary + direction LR + S1[hello] --> |shared prefix| S2[helloworld] + S2 --> |shared prefix| S3[hellothere] + S3 --> |shared prefix| S4[hellokitty] + end + style Front-Coded Dictionary fill:#f9f,stroke:#333,stroke-width:2px +``` + +### Key Operations +- `build(strings)`: Build dictionary from string list +- `lookup(id)`: Retrieve string by ID +- `compress()`: Apply front-coding compression +- `decompress(id)`: Decompress a specific string + +### Compression Details +- Common prefixes shared between consecutive strings +- Variable-length encoding for shared prefix lengths +- Delta encoding for string differences + +## 4. Blocked Inverted Index + +### Purpose +Efficient term-based search using blocked inverted indexing. 
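Before the C++ structure below, a minimal Rust sketch of the blocked layout: doc IDs are split into fixed-size sorted blocks, and each block's first/last entries act as min/max bounds so the intersection can skip whole blocks. Type and method names are assumptions made for illustration, not the ported API.

```rust
use std::collections::HashMap;

/// One fixed-size block of a posting list; doc IDs stay sorted, so the
/// first/last entries serve as the block's min/max doc-ID bounds.
struct Block {
    doc_ids: Vec<u32>,
}

impl Block {
    fn contains(&self, doc_id: u32) -> bool {
        match (self.doc_ids.first(), self.doc_ids.last()) {
            // skip the binary search when doc_id falls outside [min, max]
            (Some(&min), Some(&max)) if doc_id >= min && doc_id <= max => {
                self.doc_ids.binary_search(&doc_id).is_ok()
            }
            _ => false,
        }
    }
}

/// Sketch of a blocked inverted index: each term maps to a block list.
struct InvertedIndex {
    term_to_postings: HashMap<String, Vec<Block>>,
    block_size: usize,
}

impl InvertedIndex {
    fn new(block_size: usize) -> Self {
        Self { term_to_postings: HashMap::new(), block_size }
    }

    /// Documents must arrive in increasing doc_id order so blocks stay sorted.
    fn add_document(&mut self, doc_id: u32, terms: &[&str]) {
        for &t in terms {
            let blocks = self.term_to_postings.entry(t.to_string()).or_default();
            match blocks.last_mut() {
                Some(b) if b.doc_ids.len() < self.block_size => b.doc_ids.push(doc_id),
                _ => blocks.push(Block { doc_ids: vec![doc_id] }),
            }
        }
    }

    /// Intersect all terms' postings, driving with the shortest list.
    fn search(&self, terms: &[&str]) -> Vec<u32> {
        let mut lists = Vec::with_capacity(terms.len());
        for &t in terms {
            match self.term_to_postings.get(t) {
                Some(blocks) => lists.push(blocks),
                None => return Vec::new(), // a missing term empties the result
            }
        }
        if lists.is_empty() {
            return Vec::new();
        }
        // drive with the term that has the fewest postings
        lists.sort_by_key(|bs| bs.iter().map(|b| b.doc_ids.len()).sum::<usize>());
        let (driver, rest) = lists.split_first().expect("checked non-empty");
        driver
            .iter()
            .flat_map(|b| b.doc_ids.iter().copied())
            .filter(|&d| rest.iter().all(|bs| bs.iter().any(|b| b.contains(d))))
            .collect()
    }
}
```

A real implementation would keep explicit min/max headers per block and compress block contents; the linear scan over blocks here stands in for the skip structure.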
+ +### Structure +```cpp +struct block { + std::vector doc_ids; // Document IDs in block + uint32_t min_doc_id; // Minimum doc ID in block + uint32_t max_doc_id; // Maximum doc ID in block +}; + +struct inverted_index { + std::vector blocks; // Index blocks + std::unordered_map> term_to_blocks; + size_t block_size; // Size of each block +}; +``` + +### Visualization +```mermaid +graph TD + subgraph Inverted Index + direction TB + T1[Term 1] --> B1[Block 1] + T1 --> B2[Block 2] + T2[Term 2] --> B2 + T2 --> B3[Block 3] + T3[Term 3] --> B1 + T3 --> B3 + + subgraph Block 1 + D1[Doc 1] + D2[Doc 2] + D3[Doc 3] + end + + subgraph Block 2 + D4[Doc 4] + D5[Doc 5] + end + + subgraph Block 3 + D6[Doc 6] + D7[Doc 7] + end + end + style Inverted Index fill:#f9f,stroke:#333,stroke-width:2px +``` + +### Key Operations +- `add_document(doc_id, terms)`: Add document to index +- `search(terms)`: Find documents containing terms +- `merge_blocks()`: Optimize block structure +- `clear()`: Reset the index + +### Blocking Strategy +- Fixed-size blocks for predictable memory usage +- Block-level compression for space efficiency +- Skip pointers for faster traversal + +## Memory and Performance Considerations + +### Memory Layout +1. **Contiguous Storage** + - Strings stored in contiguous memory + - Scores aligned for SIMD operations + - Block data packed efficiently + +2. **Cache Optimization** + - Hot data kept together + - Cold data separated + - Alignment for cache lines + +### Performance Optimizations +1. **String Operations** + - String interning for deduplication + - Small string optimization + - Custom string comparison + +2. **Search Optimizations** + - Block-level skipping + - Term frequency caching + - Result set intersection optimization + +3. **Memory Management** + - Custom allocators for specific structures + - Memory pooling for frequent allocations + - Lazy initialization where appropriate + +## Usage Examples + +### String Pool Usage +```cpp +scored_string_pool pool(POOL_SIZE); +pool.insert("completion1", 0.8); +pool.insert("completion2", 0.6); +auto completions = pool.get_top_k(10); +``` + +### Trie Usage +```cpp +completion_trie trie; +trie.insert("hello world"); +trie.insert("hello there"); +auto results = trie.complete("hello"); +``` + +### Dictionary Usage +```cpp +fc_dictionary dict; +dict.build(strings); +auto str = dict.lookup(42); +``` + +### Index Usage +```cpp +inverted_index index; +index.add_document(1, {"term1", "term2"}); +auto docs = index.search({"term1", "term2"}); +``` \ No newline at end of file diff --git a/external/CMakeLists.txt b/external/CMakeLists.txt deleted file mode 100644 index d4722aa..0000000 --- a/external/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -include_directories(essentials/include) \ No newline at end of file diff --git a/external/essentials b/external/essentials deleted file mode 160000 index 3721ea2..0000000 --- a/external/essentials +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 3721ea2b02c24005088cb9efeb89b4090753bbf2 diff --git a/external/jQuery-Autocomplete b/external/jQuery-Autocomplete deleted file mode 160000 index 0ba2565..0000000 --- a/external/jQuery-Autocomplete +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 0ba256501bc365814f43066999f51f0619e739a9 diff --git a/external/mongoose b/external/mongoose deleted file mode 160000 index c41a221..0000000 --- a/external/mongoose +++ /dev/null @@ -1 +0,0 @@ -Subproject commit c41a22195ceabc02ffd0379f0e71d6c3575337aa diff --git a/include/autocomplete.hpp b/include/autocomplete.hpp deleted file mode 100644 
index 9f01ed0..0000000 --- a/include/autocomplete.hpp +++ /dev/null @@ -1,343 +0,0 @@ -#pragma once - -#include "util_types.hpp" -#include "autocomplete_common.hpp" -#include "scored_string_pool.hpp" -#include "constants.hpp" - -namespace autocomplete { - -template -struct autocomplete { - typedef scored_string_pool::iterator iterator_type; - - autocomplete() { - m_pool.resize(constants::POOL_SIZE, constants::MAX_K); - } - - autocomplete(parameters const& params) - : autocomplete() { - typename Completions::builder cm_builder(params); - typename Dictionary::builder di_builder(params); - typename InvertedIndex::builder ii_builder(params); - typename ForwardIndex::builder fi_builder(params); - - m_unsorted_docs_list.build(cm_builder.doc_ids()); - m_unsorted_minimal_docs_list.build(ii_builder.minimal_doc_ids()); - - cm_builder.build(m_completions); - di_builder.build(m_dictionary); - ii_builder.build(m_inverted_index); - fi_builder.build(m_forward_index); - } - - iterator_type prefix_topk(std::string const& query, const uint32_t k) { - assert(k <= constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix; - parse(m_dictionary, query, prefix, suffix); - - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - - // NOTE: because the completion_trie works with 1-based ids - // (id 0 is reserved for null terminator) - suffix_lex_range.begin += 1; - suffix_lex_range.end += 1; - range r = m_completions.locate_prefix(prefix, suffix_lex_range); - if (r.is_invalid()) return m_pool.begin(); - - uint32_t num_completions = - m_unsorted_docs_list.topk(r, k, m_pool.scores()); - return extract_strings(num_completions); - } - - iterator_type conjunctive_topk(std::string const& query, const uint32_t k) { - assert(k <= constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix; - uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); - assert(num_terms > 0); - - uint32_t num_completions = 0; - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - - if (num_terms == 1) { // special case - suffix_lex_range.end += 1; - num_completions = m_unsorted_minimal_docs_list.topk( - suffix_lex_range, k, m_pool.scores(), - true // must return unique results - ); - } else { - if (prefix.size() == 1) { // we've got nothing to intersect - auto it = m_inverted_index.iterator(prefix.front() - 1); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } else { - auto it = m_inverted_index.intersection_iterator(prefix); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } - } - - return extract_strings(num_completions); - } - - iterator_type topk(std::string const& query, const uint32_t k) { - assert(k <= constants::MAX_K); - - init(); - completion_type prefix; - byte_range suffix; - uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); - assert(num_terms > 0); - - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - - suffix_lex_range.begin += 1; - suffix_lex_range.end += 1; - range r = m_completions.locate_prefix(prefix, suffix_lex_range); - - uint32_t num_completions = 0; - if (!r.is_invalid()) { - num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); - } - - if (num_completions < k) { - if (num_terms == 1) { // special case - suffix_lex_range.begin -= 1; - num_completions = m_unsorted_minimal_docs_list.topk( - suffix_lex_range, k, 
m_pool.scores(), - true // must return unique results - ); - } else { - if (prefix.size() == 1) { // we've got nothing to intersect - auto it = m_inverted_index.iterator(prefix.front() - 1); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } else { - auto it = m_inverted_index.intersection_iterator(prefix); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } - } - } - - return extract_strings(num_completions); - } - - iterator_type topk(std::string const& query, const uint32_t k, - std::vector& timers) { - assert(k <= constants::MAX_K); - - // step 1: parsing - timers[0].start(); - init(); - completion_type prefix; - byte_range suffix; - uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); - assert(num_terms > 0); - timers[0].stop(); - - // step 2: prefix search - timers[1].start(); - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - suffix_lex_range.begin += 1; - suffix_lex_range.end += 1; - range r = m_completions.locate_prefix(prefix, suffix_lex_range); - uint32_t num_completions = 0; - if (!r.is_invalid()) { - num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); - } - timers[1].stop(); - - // step 3: conjunctive search - timers[2].start(); - if (num_completions < k) { - if (num_terms == 1) { // special case - suffix_lex_range.begin -= 1; - num_completions = m_unsorted_minimal_docs_list.topk( - suffix_lex_range, k, m_pool.scores(), - true // must return unique results - ); - } else { - if (prefix.size() == 1) { // we've got nothing to intersect - auto it = m_inverted_index.iterator(prefix.front() - 1); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } else { - auto it = m_inverted_index.intersection_iterator(prefix); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } - } - } - timers[2].stop(); - - // step 4: reporting - timers[3].start(); - auto it = extract_strings(num_completions); - timers[3].stop(); - - return it; - } - - // for benchmarking - iterator_type prefix_topk(std::string const& query, uint32_t const k, - std::vector& timers) { - // step 0 - timers[0].start(); - assert(k <= constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix{0, 0}; - parse(m_dictionary, query, prefix, suffix); - timers[0].stop(); - - // step 1 - timers[1].start(); - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - suffix_lex_range.begin += 1; - suffix_lex_range.end += 1; - range r = m_completions.locate_prefix(prefix, suffix_lex_range); - if (r.is_invalid()) return m_pool.begin(); - timers[1].stop(); - - // step 2 - timers[2].start(); - uint32_t num_completions = - m_unsorted_docs_list.topk(r, k, m_pool.scores()); - timers[2].stop(); - - // step 3 - timers[3].start(); - auto it = extract_strings(num_completions); - timers[3].stop(); - - return it; - } - - // for benchmarking - iterator_type conjunctive_topk(std::string const& query, uint32_t const k, - std::vector& timers) { - // step 0 - timers[0].start(); - assert(k <= constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix{0, 0}; - uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); - assert(num_terms > 0); - timers[0].stop(); - - uint32_t num_completions = 0; - - // step 1 - timers[1].start(); - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - - timers[1].stop(); - - // step 2 - 
timers[2].start(); - if (num_terms == 1) { // special case - - suffix_lex_range.end += 1; - num_completions = m_unsorted_minimal_docs_list.topk( - suffix_lex_range, k, m_pool.scores(), - true // must return unique results - ); - - } else { - if (prefix.size() == 1) { // we've got nothing to intersect - auto it = m_inverted_index.iterator(prefix.front() - 1); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } else { - auto it = m_inverted_index.intersection_iterator(prefix); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } - } - timers[2].stop(); - - // step 3 - timers[3].start(); - auto it = extract_strings(num_completions); - timers[3].stop(); - - return it; - } - - size_t bytes() const { - return m_completions.bytes() + m_unsorted_docs_list.bytes() + - m_unsorted_minimal_docs_list.bytes() + m_dictionary.bytes() + - m_inverted_index.bytes() + m_forward_index.bytes(); - } - - void print_stats() const; - - template - void visit(Visitor& visitor) { - visitor.visit(m_completions); - visitor.visit(m_unsorted_docs_list); - visitor.visit(m_unsorted_minimal_docs_list); - visitor.visit(m_dictionary); - visitor.visit(m_inverted_index); - visitor.visit(m_forward_index); - } - -private: - Completions m_completions; - UnsortedDocsList m_unsorted_docs_list; - UnsortedDocsList m_unsorted_minimal_docs_list; - Dictionary m_dictionary; - InvertedIndex m_inverted_index; - ForwardIndex m_forward_index; - - scored_string_pool m_pool; - - void init() { - m_pool.clear(); - m_pool.init(); - assert(m_pool.size() == 0); - } - - template - uint32_t conjunctive_topk(Iterator& it, const range r, uint32_t const k) { - auto& topk_scores = m_pool.scores(); - uint32_t results = 0; - for (; it.has_next(); ++it) { - auto doc_id = *it; - if (m_forward_index.intersects(doc_id, r)) { - topk_scores[results++] = doc_id; - if (results == k) break; - } - } - return results; - } - - iterator_type extract_strings(const uint32_t num_completions) { - auto const& topk_scores = m_pool.scores(); - for (uint32_t i = 0; i != num_completions; ++i) { - auto doc_id = topk_scores[i]; - auto it = m_forward_index.iterator(doc_id); - uint64_t offset = m_pool.bytes(); - uint8_t* decoded = m_pool.data() + offset; - for (uint32_t j = 0; j != it.size(); ++j, ++it) { - auto term_id = *it; - uint8_t len = m_dictionary.extract(term_id, decoded); - decoded += len; - offset += len; - if (j != it.size() - 1) { - *decoded++ = ' '; - offset++; - } - } - m_pool.push_back_offset(offset); - } - assert(m_pool.size() == num_completions); - return m_pool.begin(); - } -}; -} // namespace autocomplete \ No newline at end of file diff --git a/include/autocomplete2.hpp b/include/autocomplete2.hpp deleted file mode 100644 index 3003c02..0000000 --- a/include/autocomplete2.hpp +++ /dev/null @@ -1,400 +0,0 @@ -#pragma once - -#include "util_types.hpp" -#include "building_util.hpp" -#include "compact_vector.hpp" -#include "autocomplete_common.hpp" -#include "scored_string_pool.hpp" -#include "constants.hpp" - -namespace autocomplete { - -template -struct autocomplete2 { - typedef scored_string_pool::iterator iterator_type; - - autocomplete2() { - m_pool.resize(constants::POOL_SIZE, constants::MAX_K); - m_topk_completion_set.resize(constants::MAX_K, - 2 * constants::MAX_NUM_TERMS_PER_QUERY); - } - - autocomplete2(parameters const& params) - : autocomplete2() { - typename Completions::builder cm_builder(params); - typename Dictionary::builder di_builder(params); - typename InvertedIndex::builder ii_builder(params); - - auto const& doc_ids 
= cm_builder.doc_ids(); - m_unsorted_docs_list.build(doc_ids); - m_unsorted_minimal_docs_list.build(ii_builder.minimal_doc_ids()); - - { - essentials::logger("building map from doc_id to lex_id..."); - uint64_t n = doc_ids.size(); - typedef std::vector> id_map_type; - id_map_type ids; - ids.reserve(n); - for (id_type lex_id = 0; lex_id != n; ++lex_id) { - ids.emplace_back(lex_id, doc_ids[lex_id]); - } - std::sort(ids.begin(), ids.end(), [](auto const& l, auto const& r) { - return l.second < r.second; - }); - m_docid_to_lexid.build( - util::first_iterator( - ids.begin()), - ids.size()); - essentials::logger("DONE"); - } - - cm_builder.build(m_completions); - di_builder.build(m_dictionary); - ii_builder.build(m_inverted_index); - } - - iterator_type prefix_topk(std::string const& query, const uint32_t k) { - assert(k <= constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix; - parse(m_dictionary, query, prefix, suffix); - - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - - suffix_lex_range.begin += 1; - suffix_lex_range.end += 1; - range r = m_completions.locate_prefix(prefix, suffix_lex_range); - if (r.is_invalid()) return m_pool.begin(); - - uint32_t num_completions = - m_unsorted_docs_list.topk(r, k, m_pool.scores()); - extract_completions(num_completions); - return extract_strings(num_completions); - } - - iterator_type conjunctive_topk(std::string const& query, const uint32_t k) { - assert(k <= constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix; - uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); - assert(num_terms > 0); - - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - - uint32_t num_completions = 0; - - if (num_terms == 1) { // special case - suffix_lex_range.end += 1; - num_completions = m_unsorted_minimal_docs_list.topk( - suffix_lex_range, k, m_pool.scores(), - true // must return unique results - ); - extract_completions(num_completions); - } else { - if (prefix.size() == 1) { // we've got nothing to intersect - auto it = m_inverted_index.iterator(prefix.front() - 1); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } else { - auto it = m_inverted_index.intersection_iterator(prefix); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } - } - - return extract_strings(num_completions); - } - - iterator_type topk(std::string const& query, const uint32_t k) { - assert(k <= constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix; - uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); - assert(num_terms > 0); - - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - - suffix_lex_range.begin += 1; - suffix_lex_range.end += 1; - range r = m_completions.locate_prefix(prefix, suffix_lex_range); - - uint32_t num_completions = 0; - if (!r.is_invalid()) { - num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); - } - - if (num_completions < k) { - if (num_terms == 1) { // special case - suffix_lex_range.begin -= 1; - num_completions = m_unsorted_minimal_docs_list.topk( - suffix_lex_range, k, m_pool.scores(), - true // must return unique results - ); - extract_completions(num_completions); - } else { - if (prefix.size() == 1) { // we've got nothing to intersect - auto it = m_inverted_index.iterator(prefix.front() - 1); - num_completions = 
conjunctive_topk(it, suffix_lex_range, k); - } else { - auto it = m_inverted_index.intersection_iterator(prefix); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } - } - } else { - extract_completions(num_completions); - } - - return extract_strings(num_completions); - } - - iterator_type topk(std::string const& query, const uint32_t k, - std::vector& timers) { - assert(k <= constants::MAX_K); - - timers[0].start(); - init(); - completion_type prefix; - byte_range suffix; - uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); - assert(num_terms > 0); - timers[0].stop(); - - timers[1].start(); - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - suffix_lex_range.begin += 1; - suffix_lex_range.end += 1; - range r = m_completions.locate_prefix(prefix, suffix_lex_range); - uint32_t num_completions = 0; - if (!r.is_invalid()) { - num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); - } - timers[1].stop(); - - timers[2].start(); - if (num_completions < k) { - if (num_terms == 1) { // special case - suffix_lex_range.begin -= 1; - num_completions = m_unsorted_minimal_docs_list.topk( - suffix_lex_range, k, m_pool.scores(), - true // must return unique results - ); - extract_completions(num_completions); - } else { - if (prefix.size() == 1) { // we've got nothing to intersect - auto it = m_inverted_index.iterator(prefix.front() - 1); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } else { - auto it = m_inverted_index.intersection_iterator(prefix); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } - } - } else { - extract_completions(num_completions); - } - timers[2].stop(); - - timers[3].start(); - auto it = extract_strings(num_completions); - timers[3].stop(); - - return it; - } - - // for benchmarking - iterator_type prefix_topk(std::string const& query, uint32_t const k, - std::vector& timers) { - // step 0 - timers[0].start(); - assert(k <= constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix{0, 0}; - parse(m_dictionary, query, prefix, suffix); - timers[0].stop(); - - // step 1 - timers[1].start(); - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - - suffix_lex_range.begin += 1; - suffix_lex_range.end += 1; - range r = m_completions.locate_prefix(prefix, suffix_lex_range); - if (r.is_invalid()) return m_pool.begin(); - timers[1].stop(); - - // step 2 - timers[2].start(); - uint32_t num_completions = - m_unsorted_docs_list.topk(r, k, m_pool.scores()); - timers[2].stop(); - - // step 3 - timers[3].start(); - extract_completions(num_completions); - auto it = extract_strings(num_completions); - timers[3].stop(); - - return it; - } - - // for benchmarking - iterator_type conjunctive_topk(std::string const& query, uint32_t const k, - std::vector& timers) { - // step 0 - timers[0].start(); - assert(k <= constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix{0, 0}; - uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); - assert(num_terms > 0); - timers[0].stop(); - - uint32_t num_completions = 0; - - // step 1 - timers[1].start(); - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - timers[1].stop(); - - // step 2 - timers[2].start(); - if (num_terms == 1) { // special case - suffix_lex_range.end += 1; - num_completions = m_unsorted_minimal_docs_list.topk( - 
suffix_lex_range, k, m_pool.scores(), - true // must return unique results - ); - extract_completions(num_completions); - } else { - if (prefix.size() == 1) { // we've got nothing to intersect - auto it = m_inverted_index.iterator(prefix.front() - 1); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } else { - auto it = m_inverted_index.intersection_iterator(prefix); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } - } - timers[2].stop(); - - // step 3 - timers[3].start(); - auto it = extract_strings(num_completions); - timers[3].stop(); - - return it; - } - - size_t bytes() const { - return m_completions.bytes() + m_unsorted_docs_list.bytes() + - m_unsorted_minimal_docs_list.bytes() + m_dictionary.bytes() + - m_docid_to_lexid.bytes() + m_inverted_index.bytes(); - } - - void print_stats() const; - - template - void visit(Visitor& visitor) { - visitor.visit(m_completions); - visitor.visit(m_unsorted_docs_list); - visitor.visit(m_unsorted_minimal_docs_list); - visitor.visit(m_dictionary); - visitor.visit(m_inverted_index); - visitor.visit(m_docid_to_lexid); - } - -private: - Completions m_completions; - UnsortedDocsList m_unsorted_docs_list; - UnsortedDocsList m_unsorted_minimal_docs_list; - Dictionary m_dictionary; - InvertedIndex m_inverted_index; - compact_vector m_docid_to_lexid; - - scored_string_pool m_pool; - completion_set m_topk_completion_set; - - void init() { - m_pool.clear(); - m_pool.init(); - assert(m_pool.size() == 0); - } - - // NOTE: this can be done more efficienctly exploiting - // the fact that the strings to be extracted share a common - // prefix, thus this task should be delegated to the - // integer_fc_dictionary... (enchance the locality of the operation) - // NOTE: this only work when used during the prefix_topk step. 
- void extract_completions(const uint32_t num_completions) { - auto const& topk_scores = m_pool.scores(); - auto& completions = m_topk_completion_set.completions(); - auto& sizes = m_topk_completion_set.sizes(); - for (uint32_t i = 0; i != num_completions; ++i) { - auto doc_id = topk_scores[i]; - auto lex_id = m_docid_to_lexid[doc_id]; - uint8_t size = m_completions.extract(lex_id, completions[i]); - sizes[i] = size; - } - } - - template - uint32_t conjunctive_topk(Iterator& it, const range r, const uint32_t k) { - auto& topk_scores = m_pool.scores(); - auto& completions = m_topk_completion_set.completions(); - auto& sizes = m_topk_completion_set.sizes(); - uint32_t i = 0; - - for (; it.has_next(); ++it) { - auto doc_id = *it; - auto lex_id = m_docid_to_lexid[doc_id]; - uint32_t size = m_completions.extract(lex_id, completions[i]); - - bool found = false; - for (uint32_t j = 0; j != size and !found; ++j) { - if (r.contains(completions[i][j])) found = true; - } - - if (found) { - topk_scores[i] = doc_id; - sizes[i] = size; - ++i; - if (i == k) break; - } - } - - return i; - } - - iterator_type extract_strings(const uint32_t num_completions) { - auto const& completions = m_topk_completion_set.completions(); - auto const& sizes = m_topk_completion_set.sizes(); - for (uint32_t i = 0; i != num_completions; ++i) { - auto const& c = completions[i]; - uint32_t size = sizes[i]; - uint64_t offset = m_pool.bytes(); - uint8_t* decoded = m_pool.data() + offset; - for (uint32_t j = 0; j != size; ++j) { - auto term_id = c[j]; - uint8_t len = m_dictionary.extract(term_id, decoded); - decoded += len; - offset += len; - if (j != size - 1) { - *decoded++ = ' '; - offset++; - } - } - m_pool.push_back_offset(offset); - } - assert(m_pool.size() == num_completions); - return m_pool.begin(); - } -}; -} // namespace autocomplete \ No newline at end of file diff --git a/include/autocomplete3.hpp b/include/autocomplete3.hpp deleted file mode 100644 index 550aac5..0000000 --- a/include/autocomplete3.hpp +++ /dev/null @@ -1,388 +0,0 @@ -#pragma once - -#include "util_types.hpp" -#include "building_util.hpp" -#include "compact_vector.hpp" -#include "autocomplete_common.hpp" -#include "scored_string_pool.hpp" -#include "min_heap.hpp" -#include "constants.hpp" - -namespace autocomplete { - -/* -During the conjunctive step, maintain a min-heap of iterators, -one iterator for each termID in the lexicographic range of the -last token of the query. 
-*/ - -template -struct autocomplete3 { - typedef scored_string_pool::iterator iterator_type; - typedef min_heap> - min_priority_queue_type; - - autocomplete3() { - m_pool.resize(constants::POOL_SIZE, constants::MAX_K); - m_topk_completion_set.resize(constants::MAX_K, - 2 * constants::MAX_NUM_TERMS_PER_QUERY); - } - - autocomplete3(parameters const& params) - : autocomplete3() { - typename Completions::builder cm_builder(params); - typename Dictionary::builder di_builder(params); - typename InvertedIndex::builder ii_builder(params); - - auto const& doc_ids = cm_builder.doc_ids(); - m_unsorted_docs_list.build(doc_ids); - - { - essentials::logger("building map from doc_id to lex_id..."); - uint64_t n = doc_ids.size(); - typedef std::vector> id_map_type; - id_map_type ids; - ids.reserve(n); - for (id_type lex_id = 0; lex_id != n; ++lex_id) { - ids.emplace_back(lex_id, doc_ids[lex_id]); - } - std::sort(ids.begin(), ids.end(), [](auto const& l, auto const& r) { - return l.second < r.second; - }); - m_docid_to_lexid.build( - util::first_iterator( - ids.begin()), - ids.size()); - essentials::logger("DONE"); - } - - cm_builder.build(m_completions); - di_builder.build(m_dictionary); - ii_builder.build(m_inverted_index); - } - - iterator_type prefix_topk(std::string const& query, const uint32_t k) { - assert(k <= constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix; - parse(m_dictionary, query, prefix, suffix); - - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - - suffix_lex_range.begin += 1; - suffix_lex_range.end += 1; - range r = m_completions.locate_prefix(prefix, suffix_lex_range); - if (r.is_invalid()) return m_pool.begin(); - - uint32_t num_completions = - m_unsorted_docs_list.topk(r, k, m_pool.scores()); - extract_completions(num_completions); - return extract_strings(num_completions); - } - - iterator_type conjunctive_topk(std::string const& query, const uint32_t k) { - assert(k <= constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix; - parse(m_dictionary, query, prefix, suffix); - - uint32_t num_completions = 0; - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - - if (prefix.size() == 1) { // we've got nothing to intersect - auto it = m_inverted_index.iterator(prefix.front() - 1); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } else { - auto it = m_inverted_index.intersection_iterator(prefix); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } - - extract_completions(num_completions); - return extract_strings(num_completions); - } - - iterator_type topk(std::string const& query, const uint32_t k) { - assert(k <= constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix; - uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); - assert(num_terms > 0); - - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - - suffix_lex_range.begin += 1; - suffix_lex_range.end += 1; - range r = m_completions.locate_prefix(prefix, suffix_lex_range); - - uint32_t num_completions = 0; - if (!r.is_invalid()) { - num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); - } - - if (num_completions < k) { - if (num_terms == 1) { // we've got nothing to intersect - iterator it(0, m_inverted_index.num_docs()); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } else if 
(prefix.size() == 1) { // we've got nothing to intersect - auto it = m_inverted_index.iterator(prefix.front() - 1); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } else { - auto it = m_inverted_index.intersection_iterator(prefix); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } - } - - extract_completions(num_completions); - return extract_strings(num_completions); - } - - iterator_type topk(std::string const& query, const uint32_t k, - std::vector& timers) { - assert(k <= constants::MAX_K); - - timers[0].start(); - init(); - completion_type prefix; - byte_range suffix; - uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); - assert(num_terms > 0); - timers[0].stop(); - - timers[1].start(); - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - suffix_lex_range.begin += 1; - suffix_lex_range.end += 1; - range r = m_completions.locate_prefix(prefix, suffix_lex_range); - uint32_t num_completions = 0; - if (!r.is_invalid()) { - num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); - } - timers[1].stop(); - - timers[2].start(); - if (num_completions < k) { - if (num_terms == 1) { // we've got nothing to intersect - iterator it(0, m_inverted_index.num_docs()); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } else if (prefix.size() == 1) { // we've got nothing to intersect - auto it = m_inverted_index.iterator(prefix.front() - 1); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } else { - auto it = m_inverted_index.intersection_iterator(prefix); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } - } - timers[2].stop(); - - timers[3].start(); - extract_completions(num_completions); - auto it = extract_strings(num_completions); - timers[3].stop(); - - return it; - } - - // for benchmarking - iterator_type prefix_topk(std::string const& query, uint32_t const k, - std::vector& timers) { - // step 0 - timers[0].start(); - assert(k <= constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix{0, 0}; - parse(m_dictionary, query, prefix, suffix); - timers[0].stop(); - - // step 1 - timers[1].start(); - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - - suffix_lex_range.begin += 1; - suffix_lex_range.end += 1; - range r = m_completions.locate_prefix(prefix, suffix_lex_range); - if (r.is_invalid()) return m_pool.begin(); - timers[1].stop(); - - // step 2 - timers[2].start(); - uint32_t num_completions = - m_unsorted_docs_list.topk(r, k, m_pool.scores()); - timers[2].stop(); - - // step 3 - timers[3].start(); - extract_completions(num_completions); - auto it = extract_strings(num_completions); - timers[3].stop(); - - return it; - } - - // for benchmarking - iterator_type conjunctive_topk(std::string const& query, uint32_t const k, - std::vector& timers) { - // step 0 - timers[0].start(); - assert(k <= constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix{0, 0}; - parse(m_dictionary, query, prefix, suffix); - timers[0].stop(); - - uint32_t num_completions = 0; - - // step 1 - timers[1].start(); - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - timers[1].stop(); - - // step 2 - timers[2].start(); - if (prefix.size() == 1) { // we've got nothing to intersect - auto it = m_inverted_index.iterator(prefix.front() - 1); - num_completions = 
conjunctive_topk(it, suffix_lex_range, k); - } else { - auto it = m_inverted_index.intersection_iterator(prefix); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } - timers[2].stop(); - - // step 3 - timers[3].start(); - extract_completions(num_completions); - auto it = extract_strings(num_completions); - timers[3].stop(); - - return it; - } - - size_t bytes() const { - return m_completions.bytes() + m_unsorted_docs_list.bytes() + - m_dictionary.bytes() + m_docid_to_lexid.bytes() + - m_inverted_index.bytes(); - } - - void print_stats() const; - - template <typename Visitor> - void visit(Visitor& visitor) { - visitor.visit(m_completions); - visitor.visit(m_unsorted_docs_list); - visitor.visit(m_dictionary); - visitor.visit(m_inverted_index); - visitor.visit(m_docid_to_lexid); - } - -private: - Completions m_completions; - UnsortedDocsList m_unsorted_docs_list; - Dictionary m_dictionary; - InvertedIndex m_inverted_index; - compact_vector m_docid_to_lexid; - - scored_string_pool m_pool; - completion_set m_topk_completion_set; - - void init() { - m_pool.clear(); - m_pool.init(); - assert(m_pool.size() == 0); - } - - // NOTE: this can be done more efficiently exploiting - // the fact that the strings to be extracted share a common - // prefix, thus this task should be delegated to the - // integer_fc_dictionary... (enhance the locality of the operation) - // NOTE: this only works when used during the prefix_topk step. - void extract_completions(const uint32_t num_completions) { - auto const& topk_scores = m_pool.scores(); - auto& completions = m_topk_completion_set.completions(); - auto& sizes = m_topk_completion_set.sizes(); - for (uint32_t i = 0; i != num_completions; ++i) { - auto doc_id = topk_scores[i]; - auto lex_id = m_docid_to_lexid[doc_id]; - uint8_t size = m_completions.extract(lex_id, completions[i]); - sizes[i] = size; - } - } - - template <typename Iterator> - uint32_t conjunctive_topk(Iterator& it, const range r, const uint32_t k) { - assert(!r.is_invalid()); - - auto& topk_scores = m_pool.scores(); - min_priority_queue_type q; - q.reserve(r.end - r.begin + 1); // inclusive range - assert(r.begin > 0); - for (uint64_t term_id = r.begin; term_id <= r.end; ++term_id) { - q.push_back(m_inverted_index.iterator(term_id - 1)); - } - q.make_heap(); - - uint32_t results = 0; - for (; it.has_next() and !q.empty(); ++it) { - auto doc_id = *it; - - bool found = false; - while (!q.empty() and !found) { - auto& z = q.top(); - auto val = *z; - if (val > doc_id) break; - if (val < doc_id) { - val = z.next_geq(doc_id); - if (!z.has_next()) { - q.pop(); - } else { - q.heapify(); - } - } - if (val == doc_id) found = true; - } - - if (found) { - topk_scores[results++] = doc_id; - if (results == k) break; - } - } - - return results; - } - - iterator_type extract_strings(const uint32_t num_completions) { - auto const& completions = m_topk_completion_set.completions(); - auto const& sizes = m_topk_completion_set.sizes(); - for (uint32_t i = 0; i != num_completions; ++i) { - auto const& c = completions[i]; - uint32_t size = sizes[i]; - uint64_t offset = m_pool.bytes(); - uint8_t* decoded = m_pool.data() + offset; - for (uint32_t j = 0; j != size; ++j) { - auto term_id = c[j]; - uint8_t len = m_dictionary.extract(term_id, decoded); - decoded += len; - offset += len; - if (j != size - 1) { - *decoded++ = ' '; - offset++; - } - } - m_pool.push_back_offset(offset); - } - assert(m_pool.size() == num_completions); - return m_pool.begin(); - } -}; -} // namespace autocomplete \ No newline at end of file diff --git
a/include/autocomplete4.hpp b/include/autocomplete4.hpp deleted file mode 100644 index 8b3d882..0000000 --- a/include/autocomplete4.hpp +++ /dev/null @@ -1,322 +0,0 @@ -#pragma once - -#include "util_types.hpp" -#include "building_util.hpp" -#include "compact_vector.hpp" -#include "autocomplete_common.hpp" -#include "scored_string_pool.hpp" -#include "min_heap.hpp" -#include "constants.hpp" - -namespace autocomplete { - -/* Bast and Weber approach. */ - -template -struct autocomplete4 { - typedef scored_string_pool::iterator iterator_type; - - autocomplete4() { - m_pool.resize(constants::POOL_SIZE, constants::MAX_K); - m_topk_completion_set.resize(constants::MAX_K, - 2 * constants::MAX_NUM_TERMS_PER_QUERY); - } - - autocomplete4(parameters const& params, float c) - : autocomplete4() { - typename Completions::builder cm_builder(params); - typename Dictionary::builder di_builder(params); - typename BlockedInvertedIndex::builder ii_builder(params, c); - - auto const& doc_ids = cm_builder.doc_ids(); - m_unsorted_docs_list.build(doc_ids); - - { - essentials::logger("building map from doc_id to lex_id..."); - uint64_t n = doc_ids.size(); - typedef std::vector> id_map_type; - id_map_type ids; - ids.reserve(n); - for (id_type lex_id = 0; lex_id != n; ++lex_id) { - ids.emplace_back(lex_id, doc_ids[lex_id]); - } - std::sort(ids.begin(), ids.end(), [](auto const& l, auto const& r) { - return l.second < r.second; - }); - m_docid_to_lexid.build( - util::first_iterator( - ids.begin()), - ids.size()); - essentials::logger("DONE"); - } - - cm_builder.build(m_completions); - di_builder.build(m_dictionary); - ii_builder.build(m_inverted_index); - } - - iterator_type prefix_topk(std::string const& query, const uint32_t k) { - assert(k <= constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix; - parse(m_dictionary, query, prefix, suffix); - - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - - suffix_lex_range.begin += 1; - suffix_lex_range.end += 1; - range r = m_completions.locate_prefix(prefix, suffix_lex_range); - if (r.is_invalid()) return m_pool.begin(); - - uint32_t num_completions = - m_unsorted_docs_list.topk(r, k, m_pool.scores()); - extract_completions(num_completions); - return extract_strings(num_completions); - } - - iterator_type conjunctive_topk(std::string const& query, const uint32_t k) { - assert(k <= constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix; - parse(m_dictionary, query, prefix, suffix); - - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - - uint32_t num_completions = - conjunctive_topk(prefix, suffix_lex_range, k, m_pool.scores()); - extract_completions(num_completions); - return extract_strings(num_completions); - } - - iterator_type topk(std::string const& query, const uint32_t k) { - assert(k <= constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix; - parse(m_dictionary, query, prefix, suffix); - - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - - suffix_lex_range.begin += 1; - suffix_lex_range.end += 1; - range r = m_completions.locate_prefix(prefix, suffix_lex_range); - - uint32_t num_completions = 0; - if (!r.is_invalid()) { - num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); - } - - if (num_completions < k) { - num_completions = conjunctive_topk(prefix, suffix_lex_range, 
k); - } - - extract_completions(num_completions); - return extract_strings(num_completions); - } - - iterator_type topk(std::string const& query, const uint32_t k, - std::vector& timers) { - assert(k <= constants::MAX_K); - - timers[0].start(); - init(); - completion_type prefix; - byte_range suffix; - parse(m_dictionary, query, prefix, suffix); - timers[0].stop(); - - timers[1].start(); - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - suffix_lex_range.begin += 1; - suffix_lex_range.end += 1; - range r = m_completions.locate_prefix(prefix, suffix_lex_range); - uint32_t num_completions = 0; - if (!r.is_invalid()) { - num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); - } - timers[1].stop(); - - timers[2].start(); - if (num_completions < k) { - num_completions = conjunctive_topk(prefix, suffix_lex_range, k); - } - timers[2].stop(); - - timers[3].start(); - extract_completions(num_completions); - auto it = extract_strings(num_completions); - timers[3].stop(); - - return it; - } - - // for benchmarking - iterator_type prefix_topk(std::string const& query, uint32_t const k, - std::vector& timers) { - // step 0 - timers[0].start(); - assert(k <= constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix{0, 0}; - parse(m_dictionary, query, prefix, suffix); - timers[0].stop(); - - // step 1 - timers[1].start(); - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - - suffix_lex_range.begin += 1; - suffix_lex_range.end += 1; - range r = m_completions.locate_prefix(prefix, suffix_lex_range); - if (r.is_invalid()) return m_pool.begin(); - timers[1].stop(); - - // step 2 - timers[2].start(); - uint32_t num_completions = - m_unsorted_docs_list.topk(r, k, m_pool.scores()); - timers[2].stop(); - - // step 3 - timers[3].start(); - extract_completions(num_completions); - auto it = extract_strings(num_completions); - timers[3].stop(); - - return it; - } - - // for benchmarking - iterator_type conjunctive_topk(std::string const& query, uint32_t const k, - std::vector& timers) { - // step 0 - timers[0].start(); - assert(k <= constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix{0, 0}; - parse(m_dictionary, query, prefix, suffix); - timers[0].stop(); - - uint32_t num_completions = 0; - - // step 1 - timers[1].start(); - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - timers[1].stop(); - - // step 2 - timers[2].start(); - num_completions = - conjunctive_topk(prefix, suffix_lex_range, k, m_pool.scores()); - timers[2].stop(); - - // step 3 - timers[3].start(); - extract_completions(num_completions); - auto it = extract_strings(num_completions); - timers[3].stop(); - - return it; - } - - size_t bytes() const { - return m_completions.bytes() + m_unsorted_docs_list.bytes() + - m_dictionary.bytes() + m_docid_to_lexid.bytes() + - m_inverted_index.bytes(); - } - - void print_stats() const; - - template - void visit(Visitor& visitor) { - visitor.visit(m_completions); - visitor.visit(m_unsorted_docs_list); - visitor.visit(m_dictionary); - visitor.visit(m_inverted_index); - visitor.visit(m_docid_to_lexid); - } - -private: - Completions m_completions; - UnsortedDocsList m_unsorted_docs_list; - Dictionary m_dictionary; - BlockedInvertedIndex m_inverted_index; - compact_vector m_docid_to_lexid; - - scored_string_pool m_pool; - completion_set 
m_topk_completion_set; - - void init() { - m_pool.clear(); - m_pool.init(); - assert(m_pool.size() == 0); - } - - // NOTE: this can be done more efficiently exploiting - // the fact that the strings to be extracted share a common - // prefix, thus this task should be delegated to the - // integer_fc_dictionary... (enhance the locality of the operation) - // NOTE: this only works when used during the prefix_topk step. - void extract_completions(const uint32_t num_completions) { - auto const& topk_scores = m_pool.scores(); - auto& completions = m_topk_completion_set.completions(); - auto& sizes = m_topk_completion_set.sizes(); - for (uint32_t i = 0; i != num_completions; ++i) { - auto doc_id = topk_scores[i]; - auto lex_id = m_docid_to_lexid[doc_id]; - uint8_t size = m_completions.extract(lex_id, completions[i]); - sizes[i] = size; - } - } - - uint32_t conjunctive_topk(completion_type& prefix, const range suffix, - const uint32_t k) { - auto& topk_scores = m_pool.scores(); - auto it = m_inverted_index.intersection_iterator(prefix, suffix); - uint32_t results = 0; - for (; it.has_next(); ++it) { - auto doc_id = *it; - if (it.intersects()) { - topk_scores[results++] = doc_id; - if (results == k) break; - } - } - return results; - } - - iterator_type extract_strings(const uint32_t num_completions) { - auto const& completions = m_topk_completion_set.completions(); - auto const& sizes = m_topk_completion_set.sizes(); - for (uint32_t i = 0; i != num_completions; ++i) { - auto const& c = completions[i]; - uint32_t size = sizes[i]; - uint64_t offset = m_pool.bytes(); - uint8_t* decoded = m_pool.data() + offset; - for (uint32_t j = 0; j != size; ++j) { - auto term_id = c[j]; - uint8_t len = m_dictionary.extract(term_id, decoded); - decoded += len; - offset += len; - if (j != size - 1) { - *decoded++ = ' '; - offset++; - } - } - m_pool.push_back_offset(offset); - } - assert(m_pool.size() == num_completions); - return m_pool.begin(); - } -}; -} // namespace autocomplete \ No newline at end of file diff --git a/include/autocomplete_common.hpp b/include/autocomplete_common.hpp deleted file mode 100644 index c04f8b6..0000000 --- a/include/autocomplete_common.hpp +++ /dev/null @@ -1,22 +0,0 @@ -#pragma once - -#include "util_types.hpp" - -namespace autocomplete { - -template <typename Dictionary> -uint32_t parse(Dictionary const& dict, std::string const& query, - completion_type& prefix, byte_range& suffix) { - uint32_t num_terms = 1; - byte_range_iterator it(string_to_byte_range(query)); - while (true) { - suffix = it.next(); - if (!it.has_next()) break; - auto term_id = dict.locate(suffix); - prefix.push_back(term_id); - ++num_terms; - } - return num_terms; -} - -} // namespace autocomplete \ No newline at end of file diff --git a/include/building_util.hpp b/include/building_util.hpp deleted file mode 100644 index 17427b6..0000000 --- a/include/building_util.hpp +++ /dev/null @@ -1,62 +0,0 @@ -#pragma once - -#include "bit_vector.hpp" - -namespace autocomplete { -namespace util { - -void push_pad(bit_vector_builder& bvb, uint64_t alignment = 8) { - uint64_t mod = bvb.size() % alignment; - if (mod) { - uint64_t pad = alignment - mod; - bvb.append_bits(0, pad); - assert(bvb.size() % alignment == 0); - } -} - -void eat_pad(bits_iterator& it, uint64_t alignment = 8) { - uint64_t mod = it.position() % alignment; - if (mod) { - uint64_t pad = alignment - mod; - it.get_bits(pad); - assert(it.position() % alignment == 0); - } -} - -template <typename Iterator> -struct first_iterator - : std::iterator { - first_iterator(Iterator it, uint64_t state = 0)
- : m_it(it) - , m_state(state) {} - - typename Iterator::value_type::first_type operator*() { - return (*m_it).first; - } - - first_iterator& operator++() { - m_it += 1; - m_state += 1; - return *this; - } - - first_iterator operator+(uint64_t n) { - return {m_it + n, m_state + n}; - } - - bool operator==(first_iterator const& other) const { - return m_state == other.m_state; - } - - bool operator!=(first_iterator const& other) const { - return !(*this == other); - } - -private: - Iterator m_it; - uint64_t m_state; -}; - -} // namespace util -} // namespace autocomplete \ No newline at end of file diff --git a/include/delta_forward_index.hpp b/include/delta_forward_index.hpp deleted file mode 100644 index 6a302ab..0000000 --- a/include/delta_forward_index.hpp +++ /dev/null @@ -1,149 +0,0 @@ -#pragma once - -#include "parameters.hpp" -#include "bit_vector.hpp" -#include "ef/ef_sequence.hpp" - -namespace autocomplete { - -struct delta_forward_index { - struct builder { - builder() {} - - builder(parameters const& params) - : m_num_integers(0) - , m_num_terms(params.num_terms) { - essentials::logger("building forward_index..."); - uint64_t num_completions = params.num_completions; - std::ifstream input( - (params.collection_basename + ".forward").c_str(), - std::ios_base::in); - m_pointers.push_back(0); - for (uint64_t i = 0; i != num_completions; ++i) { - uint32_t n = 0; - input >> n; - assert(n > 0 and n < constants::MAX_NUM_TERMS_PER_QUERY); - write_gamma_nonzero(m_data, n); - m_num_integers += n; - for (uint64_t k = 0; k != n; ++k) { - id_type x; - input >> x; - write_delta(m_data, x); - } - m_pointers.push_back(m_data.size()); - } - m_pointers.pop_back(); - input.close(); - essentials::logger("DONE"); - } - - void swap(delta_forward_index::builder& other) { - std::swap(other.m_num_integers, m_num_integers); - std::swap(other.m_num_terms, m_num_terms); - other.m_pointers.swap(m_pointers); - other.m_data.swap(m_data); - } - - void build(delta_forward_index& fi) { - fi.m_num_integers = m_num_integers; - fi.m_num_terms = m_num_terms; - fi.m_pointers.build(m_pointers); - fi.m_data.build(&m_data); - builder().swap(*this); - } - - private: - uint64_t m_num_integers; - uint64_t m_num_terms; - std::vector m_pointers; - bit_vector_builder m_data; - }; - - delta_forward_index() {} - - struct forward_list_iterator_type { - forward_list_iterator_type(bits_iterator const& it, - uint64_t n) - : m_it(it) - , m_n(n) - , m_i(0) {} - - uint64_t size() const { - return m_n; - } - - void operator++() { - m_i += 1; - } - - id_type operator*() { - return read_delta(m_it); - } - - bool intersects(const range r) { - for (uint64_t i = 0; i != size(); ++i) { - auto val = operator*(); - if (r.contains(val)) return true; - } - return false; - } - - private: - bits_iterator m_it; - uint64_t m_n; - uint64_t m_i; - }; - - forward_list_iterator_type iterator(id_type doc_id) { - uint64_t offset = m_pointers.access(doc_id); - bits_iterator it(m_data, offset); - uint64_t n = read_gamma_nonzero(it); - return {it, n}; - } - - bool intersects(const id_type doc_id, const range r) { - return iterator(doc_id).intersects(r); - } - - uint64_t num_integers() const { - return m_num_integers; - } - - uint64_t num_terms() const { - return m_num_terms; - } - - uint64_t num_docs() const { - return m_pointers.size(); - } - - size_t data_bytes() const { - return m_data.bytes(); - } - - size_t pointer_bytes() const { - return m_pointers.bytes(); - } - - size_t bytes() const { - return essentials::pod_bytes(m_num_integers) + - 
essentials::pod_bytes(m_num_terms) + m_pointers.bytes() + - m_data.bytes(); - } - - template - void visit(Visitor& visitor) { - visitor.visit(m_num_integers); - visitor.visit(m_num_terms); - visitor.visit(m_pointers); - visitor.visit(m_data); - } - -private: - uint64_t m_num_integers; - uint64_t m_num_terms; - ef::ef_sequence m_pointers; - bit_vector m_data; -}; - -} // namespace autocomplete \ No newline at end of file diff --git a/include/forward_index.hpp b/include/forward_index.hpp deleted file mode 100644 index 51c7c63..0000000 --- a/include/forward_index.hpp +++ /dev/null @@ -1,201 +0,0 @@ -#pragma once - -#include "parameters.hpp" -#include "integer_codes.hpp" -#include "building_util.hpp" -#include "ef/ef_sequence.hpp" - -namespace autocomplete { - -template -struct forward_index { - typedef ListType forward_list_type; - typedef typename forward_list_type::iterator forward_list_iterator_type; - typedef uncompressed_list permutation_list_type; - typedef - typename permutation_list_type::iterator permutation_list_iterator_type; - - struct builder { - builder() {} - - builder(parameters const& params) - : m_num_integers(0) - , m_num_terms(params.num_terms) { - essentials::logger("building forward_index..."); - - uint64_t num_completions = params.num_completions; - - std::ifstream input( - (params.collection_basename + ".forward").c_str(), - std::ios_base::in); - - std::vector list; - std::vector sorted_permutation; - std::vector permutation; - - m_pointers.push_back(0); - - for (uint64_t i = 0; i != num_completions; ++i) { - list.clear(); - sorted_permutation.clear(); - permutation.clear(); - - uint32_t n = 0; - input >> n; - assert(n > 0 and n < constants::MAX_NUM_TERMS_PER_QUERY); - m_num_integers += n; - list.reserve(n); - sorted_permutation.reserve(n); - - for (uint64_t k = 0; k != n; ++k) { - id_type x; - input >> x; - list.push_back(x); - sorted_permutation.push_back(k); - } - - write_gamma_nonzero(m_bvb, n); - if (ListType::is_byte_aligned) util::push_pad(m_bvb); - - std::sort( - sorted_permutation.begin(), sorted_permutation.end(), - [&](id_type l, id_type r) { return list[l] < list[r]; }); - - permutation.resize(n); - for (uint32_t i = 0; i != n; ++i) { - permutation[sorted_permutation[i]] = i; - } - - std::sort(list.begin(), list.end()); - forward_list_type::build(m_bvb, list.begin(), m_num_terms + 1, - n); - util::push_pad(m_bvb); - m_pointers.push_back(m_bvb.size()); - - permutation_list_type::build(m_bvb, permutation.begin(), n + 1, - n); - m_pointers.push_back(m_bvb.size()); - } - - m_pointers.pop_back(); - input.close(); - essentials::logger("DONE"); - } - - void swap(forward_index::builder& other) { - std::swap(other.m_num_integers, m_num_integers); - std::swap(other.m_num_terms, m_num_terms); - other.m_pointers.swap(m_pointers); - other.m_bvb.swap(m_bvb); - } - - void build(forward_index& fi) { - fi.m_num_integers = m_num_integers; - fi.m_num_terms = m_num_terms; - fi.m_pointers.build(m_pointers); - fi.m_data.build(&m_bvb); - builder().swap(*this); - } - - private: - uint64_t m_num_integers; - uint64_t m_num_terms; - std::vector m_pointers; - bit_vector_builder m_bvb; - }; - - forward_index() {} - - bool intersects(id_type doc_id, range r) { - return get(doc_id).intersects(r); - } - - struct permuting_iterator_type { - permuting_iterator_type(forward_list_iterator_type const& sorted, - permutation_list_iterator_type const& permuted) - : m_i(0) - , m_sorted(sorted) - , m_permuted(permuted) { - assert(sorted.size() == permuted.size()); - } - - uint32_t size() const { - 
return m_sorted.size(); - } - - id_type operator*() { - return m_sorted.access(m_permuted.access(m_i)); - } - - void operator++() { - ++m_i; - } - - private: - uint32_t m_i; - forward_list_iterator_type m_sorted; - permutation_list_iterator_type m_permuted; - }; - - permuting_iterator_type iterator(id_type doc_id) { - uint64_t offset = m_pointers.access(doc_id * 2); - bits_iterator it(m_data, offset); - uint64_t n = read_gamma_nonzero(it); - if (ListType::is_byte_aligned) util::eat_pad(it); - forward_list_iterator_type it_sorted(m_data, it.position(), - m_num_terms + 1, n); - offset = m_pointers.access(doc_id * 2 + 1); - permutation_list_iterator_type it_permutation(m_data, offset, n + 1, n); - return permuting_iterator_type(it_sorted, it_permutation); - } - - uint64_t num_integers() const { - return m_num_integers; - } - - uint64_t num_terms() const { - return m_num_terms; - } - - uint64_t num_docs() const { - return m_pointers.size(); - } - - size_t data_bytes() const { - return m_data.bytes(); - } - - size_t pointer_bytes() const { - return m_pointers.bytes(); - } - - size_t bytes() const { - return essentials::pod_bytes(m_num_integers) + - essentials::pod_bytes(m_num_terms) + m_pointers.bytes() + - m_data.bytes(); - } - - template - void visit(Visitor& visitor) { - visitor.visit(m_num_integers); - visitor.visit(m_num_terms); - visitor.visit(m_pointers); - visitor.visit(m_data); - } - -private: - uint64_t m_num_integers; - uint64_t m_num_terms; - ef::ef_sequence m_pointers; - bit_vector m_data; - - forward_list_iterator_type get(id_type doc_id) { - uint64_t offset = m_pointers.access(doc_id * 2); - bits_iterator it(m_data, offset); - uint64_t n = read_gamma_nonzero(it); - if (ListType::is_byte_aligned) util::eat_pad(it); - return {m_data, it.position(), m_num_terms + 1, n}; - } -}; - -} // namespace autocomplete \ No newline at end of file diff --git a/include/types.hpp b/include/types.hpp deleted file mode 100644 index 1083cfc..0000000 --- a/include/types.hpp +++ /dev/null @@ -1,80 +0,0 @@ -#pragma once - -#include "completion_trie.hpp" -#include "fc_dictionary.hpp" -#include "integer_fc_dictionary.hpp" -#include "uint_vec.hpp" -#include "unsorted_list.hpp" -#include "uncompressed_list.hpp" - -#include "forward_index.hpp" -#include "compact_forward_index.hpp" -#include "delta_forward_index.hpp" - -#include "inverted_index.hpp" -#include "blocked_inverted_index.hpp" - -#include "autocomplete.hpp" -#include "autocomplete2.hpp" -#include "autocomplete3.hpp" -#include "autocomplete4.hpp" - -#include "compact_vector.hpp" -#include "ef/ef_sequence.hpp" -#include "ef/compact_ef.hpp" -#include "succinct_rmq/cartesian_tree.hpp" - -namespace autocomplete { - -typedef uint_vec uint32_vec; -typedef uint_vec uint64_vec; - -// typedef completion_trie -// uint64_completion_trie; - -typedef completion_trie - ef_completion_trie; - -typedef fc_dictionary<> fc_dictionary_type; -typedef integer_fc_dictionary<> integer_fc_dictionary_type; - -typedef unsorted_list succinct_rmq; -typedef uncompressed_list uncompressed_list32_t; - -// typedef inverted_index uncompressed_inverted_index; -typedef inverted_index ef_inverted_index; - -// typedef forward_index uncompressed_forward_index; -// typedef forward_index ef_forward_index; - -// typedef blocked_inverted_index -// uncompressed_blocked_inverted_index; -typedef blocked_inverted_index ef_blocked_inverted_index; - -// typedef autocomplete -// uncompressed_autocomplete_type; - -// typedef autocomplete2 -// uncompressed_autocomplete_type2; - -/* compressed 
indexes */ -typedef autocomplete - ef_autocomplete_type1; - -typedef autocomplete2 - ef_autocomplete_type2; - -typedef autocomplete3 - ef_autocomplete_type3; - -typedef autocomplete4 - ef_autocomplete_type4; -} // namespace autocomplete \ No newline at end of file diff --git a/results/README.md b/results/README.md deleted file mode 100644 index 7e6ba77..0000000 --- a/results/README.md +++ /dev/null @@ -1,22 +0,0 @@ -Test machine ------------ - -4 Intel i7-7700 cores (@3.6 GHz); 64 GB of RAM DDR3 (@2.133 GHz); running Linux 4.4.0 (64 bits); 32K for both instruction and data L1 cache; 256K for L2 cache; 8192K for L3 cache. - -Compiler -------- - -gcc 7.4.0 - -`cmake .. -DCMAKE_BUILD_TYPE=Release -DUSE_SANITIZERS=OFF -DUSE_INTRINSICS=ON -DUSE_PDEP=ON` - - -Experiments ----------- - -- The file `space.md` reports the space breakdowns. -- The file `prefix_topk.md` reports the timing breakdowns for the prefix_topk step by varying the number of query terms. -- The file `conjunctive_topk.md` reports the timing breakdowns for the conjunctive_topk step by varying the number of query terms. -- The file `topk.md` reports the total time of the `topk` operation (combining the two steps, `prefix_topk` and `conjunctive_topk`) by varying the number of query terms. -- The file `fc_dictionary.md` reports on the `fc_dictionary` benchmark. -- The file `integer_fc_dictionary.md` reports on the `integer_fc_dictionary` benchmark. \ No newline at end of file diff --git a/results/conjunctive_topk.md b/results/conjunctive_topk.md deleted file mode 100644 index 3d9747b..0000000 --- a/results/conjunctive_topk.md +++ /dev/null @@ -1,107 +0,0 @@ -Conjunctive top-k ------------------ - -Executing queries shuffled at random, for k = 7. - -Average among 10 runs. - -From the last token of the query, we only retain the first character. This means that we spend less time obtaining the lexicographic range of the character (string comparisons are -very fast), but we spend more time on the RMQ phase, because the -range obtained from the completion trie can be very large.
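For intuition on the RMQ phase: since docIDs are assigned in decreasing score order, the minimum docID in a range of L is its best-scored completion, so the top-k can be extracted without scanning the whole range. Below is a minimal sketch of such an extraction, assuming only a hypothetical `rmq(lo, hi)` callback that returns the position of the minimum of `L` in the inclusive range `[lo, hi]`; the names are illustrative, not the exact interface of the library.

```cpp
#include <cstdint>
#include <functional>
#include <queue>
#include <tuple>
#include <vector>

// Top-k smallest values in L[lo, hi] (lo <= hi assumed), driven by a
// range-minimum structure: pop the best candidate, then split its range
// around the popped position and push the two halves.
template <typename RMQ>
std::vector<uint64_t> rmq_topk(std::vector<uint64_t> const& L, uint64_t lo,
                               uint64_t hi, uint32_t k, RMQ&& rmq) {
    typedef std::tuple<uint64_t, uint64_t, uint64_t, uint64_t>
        entry;  // (value, position, range_lo, range_hi)
    std::priority_queue<entry, std::vector<entry>, std::greater<entry>> q;
    uint64_t p = rmq(lo, hi);
    q.emplace(L[p], p, lo, hi);
    std::vector<uint64_t> topk;
    while (!q.empty() and topk.size() < k) {
        auto [val, pos, l, r] = q.top();
        q.pop();
        topk.push_back(val);
        if (pos > l) {  // left sub-range [l, pos - 1]
            uint64_t m = rmq(l, pos - 1);
            q.emplace(L[m], m, l, pos - 1);
        }
        if (pos < r) {  // right sub-range [pos + 1, r]
            uint64_t m = rmq(pos + 1, r);
            q.emplace(L[m], m, pos + 1, r);
        }
    }
    return topk;
}
```

In this sketch each reported result costs at most two further `rmq` calls plus one heap operation, regardless of the width of `[lo, hi]`.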
- -### AOL - -#### Solution 1 - - {"num_terms_per_query": "1", "num_queries": "50000", "parsing_ns_per_query": "0", "dictionary_search_ns_per_query": "3", "conjunctive_search_ns_per_query": "2896", "reporting_ns_per_query": "352"} - {"num_terms_per_query": "2", "num_queries": "50000", "parsing_ns_per_query": "52", "dictionary_search_ns_per_query": "10", "conjunctive_search_ns_per_query": "2273", "reporting_ns_per_query": "2333"} - {"num_terms_per_query": "3", "num_queries": "50000", "parsing_ns_per_query": "919", "dictionary_search_ns_per_query": "39", "conjunctive_search_ns_per_query": "20478", "reporting_ns_per_query": "1772"} - {"num_terms_per_query": "4", "num_queries": "50000", "parsing_ns_per_query": "1298", "dictionary_search_ns_per_query": "49", "conjunctive_search_ns_per_query": "27363", "reporting_ns_per_query": "974"} - {"num_terms_per_query": "5", "num_queries": "50000", "parsing_ns_per_query": "1857", "dictionary_search_ns_per_query": "42", "conjunctive_search_ns_per_query": "25484", "reporting_ns_per_query": "556"} - {"num_terms_per_query": "6", "num_queries": "50000", "parsing_ns_per_query": "2239", "dictionary_search_ns_per_query": "34", "conjunctive_search_ns_per_query": "22070", "reporting_ns_per_query": "438"} - {"num_terms_per_query": "7", "num_queries": "50000", "parsing_ns_per_query": "2871", "dictionary_search_ns_per_query": "32", "conjunctive_search_ns_per_query": "18657", "reporting_ns_per_query": "465"} - {"num_terms_per_query": "8+", "num_queries": "50000", "parsing_ns_per_query": "3774", "dictionary_search_ns_per_query": "30", "conjunctive_search_ns_per_query": "13967", "reporting_ns_per_query": "844"} - - {"num_terms_per_query": "1", "num_queries": "50000", "ns_per_query": "4463"} - {"num_terms_per_query": "2", "num_queries": "50000", "ns_per_query": "6677"} - {"num_terms_per_query": "3", "num_queries": "50000", "ns_per_query": "25503"} - {"num_terms_per_query": "4", "num_queries": "50000", "ns_per_query": "31536"} - {"num_terms_per_query": "5", "num_queries": "50000", "ns_per_query": "29973"} - {"num_terms_per_query": "6", "num_queries": "50000", "ns_per_query": "27148"} - {"num_terms_per_query": "7", "num_queries": "50000", "ns_per_query": "23630"} - {"num_terms_per_query": "8+", "num_queries": "50000", "ns_per_query": "20511"} - -If we do not check the forward index (thus erroneously reporting the first k docids of the intersection), we have: - - {"num_terms_per_query": "3", "num_queries": "50000", "conjunctive_search_ns_per_query": "10362"} - {"num_terms_per_query": "4", "num_queries": "50000", "conjunctive_search_ns_per_query": "21327"} - {"num_terms_per_query": "5", "num_queries": "50000", "conjunctive_search_ns_per_query": "23187"} - {"num_terms_per_query": "6", "num_queries": "50000", "conjunctive_search_ns_per_query": "21259"} - {"num_terms_per_query": "7", "num_queries": "50000", "conjunctive_search_ns_per_query": "18234"} - {"num_terms_per_query": "8+", "num_queries": "50000", "conjunctive_search_ns_per_query": "13912"} - -We can see that the time for the `conjunctive_search` remains the same, except for the case with 3 terms. -This suggests that the time needed to check the forward index is negligible compared to the one -needed to produce the intersection. This can also be observed by considering that the time for the case with 2 terms is very small: in this case we check the forward index for each doc in the inverted list of the first term.
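That per-document check can be sketched as follows. This is an illustrative stand-in modeled on the interfaces of the deleted headers (`iterator`, `has_next`, `intersects`), not the exact code: scan the inverted list of the lone prefix term and keep a docID only if the forward index reports that its completion contains some term in the suffix lexicographic range.

```cpp
#include <cstdint>
#include <vector>

// Sketch of the 2-term case: walk the posting list of the single prefix
// term and filter each docID through the forward index. `out` must be
// pre-sized to hold at least k entries.
template <typename InvertedIndex, typename ForwardIndex, typename Range>
uint32_t topk_single_term(InvertedIndex& inverted, ForwardIndex& forward,
                          uint64_t term_id, Range suffix_lex_range,
                          uint32_t k, std::vector<uint64_t>& out) {
    uint32_t results = 0;
    auto it = inverted.iterator(term_id - 1);  // term_ids start at 1
    for (; it.has_next() and results != k; ++it) {
        uint64_t doc_id = *it;
        // the probe whose cost the measurements above show to be negligible
        if (forward.intersects(doc_id, suffix_lex_range)) {
            out[results++] = doc_id;
        }
    }
    return results;
}
```

Since posting lists are scanned in increasing docID order and docIDs are assigned in decreasing score order, the first k docIDs that survive the check are already the top-k, so the `intersects` probe is the only extra cost per scanned document.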
- -#### Solution 2 - - {"num_terms_per_query": "1", "num_queries": "50000", "parsing_ns_per_query": "0", "dictionary_search_ns_per_query": "6", "conjunctive_search_ns_per_query": "3275", "reporting_ns_per_query": "330"} - {"num_terms_per_query": "2", "num_queries": "50000", "parsing_ns_per_query": "109", "dictionary_search_ns_per_query": "36", "conjunctive_search_ns_per_query": "15770", "reporting_ns_per_query": "2485"} - {"num_terms_per_query": "3", "num_queries": "50000", "parsing_ns_per_query": "932", "dictionary_search_ns_per_query": "52", "conjunctive_search_ns_per_query": "24290", "reporting_ns_per_query": "1780"} - {"num_terms_per_query": "4", "num_queries": "50000", "parsing_ns_per_query": "1388", "dictionary_search_ns_per_query": "55", "conjunctive_search_ns_per_query": "29056", "reporting_ns_per_query": "953"} - {"num_terms_per_query": "5", "num_queries": "50000", "parsing_ns_per_query": "1880", "dictionary_search_ns_per_query": "41", "conjunctive_search_ns_per_query": "26675", "reporting_ns_per_query": "541"} - {"num_terms_per_query": "6", "num_queries": "50000", "parsing_ns_per_query": "2277", "dictionary_search_ns_per_query": "43", "conjunctive_search_ns_per_query": "22955", "reporting_ns_per_query": "421"} - {"num_terms_per_query": "7", "num_queries": "50000", "parsing_ns_per_query": "2762", "dictionary_search_ns_per_query": "37", "conjunctive_search_ns_per_query": "19437", "reporting_ns_per_query": "443"} - {"num_terms_per_query": "8+", "num_queries": "50000", "parsing_ns_per_query": "3878", "dictionary_search_ns_per_query": "40", "conjunctive_search_ns_per_query": "14657", "reporting_ns_per_query": "814"} - - {"num_terms_per_query": "1", "num_queries": "50000", "ns_per_query": "4917"} - {"num_terms_per_query": "2", "num_queries": "50000", "ns_per_query": "20361"} - {"num_terms_per_query": "3", "num_queries": "50000", "ns_per_query": "28619"} - {"num_terms_per_query": "4", "num_queries": "50000", "ns_per_query": "33140"} - {"num_terms_per_query": "5", "num_queries": "50000", "ns_per_query": "30410"} - {"num_terms_per_query": "6", "num_queries": "50000", "ns_per_query": "27477"} - {"num_terms_per_query": "7", "num_queries": "50000", "ns_per_query": "24357"} - {"num_terms_per_query": "8+", "num_queries": "50000", "ns_per_query": "21042"} - -### MSN - -#### Solution 1 - - {"num_terms_per_query": "1", "num_queries": "50000", "parsing_ns_per_query": "0", "dictionary_search_ns_per_query": "6", "conjunctive_search_ns_per_query": "3021", "reporting_ns_per_query": "576"} - {"num_terms_per_query": "2", "num_queries": "50000", "parsing_ns_per_query": "39", "dictionary_search_ns_per_query": "7", "conjunctive_search_ns_per_query": "2279", "reporting_ns_per_query": "1926"} - {"num_terms_per_query": "3", "num_queries": "50000", "parsing_ns_per_query": "810", "dictionary_search_ns_per_query": "15", "conjunctive_search_ns_per_query": "12382", "reporting_ns_per_query": "1078"} - {"num_terms_per_query": "4", "num_queries": "50000", "parsing_ns_per_query": "1104", "dictionary_search_ns_per_query": "15", "conjunctive_search_ns_per_query": "13534", "reporting_ns_per_query": "526"} - {"num_terms_per_query": "5", "num_queries": "50000", "parsing_ns_per_query": "1737", "dictionary_search_ns_per_query": "11", "conjunctive_search_ns_per_query": "11424", "reporting_ns_per_query": "305"} - {"num_terms_per_query": "6", "num_queries": "50000", "parsing_ns_per_query": "2049", "dictionary_search_ns_per_query": "10", "conjunctive_search_ns_per_query": "9565", "reporting_ns_per_query": "252"} - 
{"num_terms_per_query": "7", "num_queries": "50000", "parsing_ns_per_query": "2396", "dictionary_search_ns_per_query": "9", "conjunctive_search_ns_per_query": "8020", "reporting_ns_per_query": "324"} - {"num_terms_per_query": "8+", "num_queries": "50000", "parsing_ns_per_query": "3431", "dictionary_search_ns_per_query": "9", "conjunctive_search_ns_per_query": "6199", "reporting_ns_per_query": "738"} - - {"num_terms_per_query": "1", "num_queries": "50000", "ns_per_query": "4982"} - {"num_terms_per_query": "2", "num_queries": "50000", "ns_per_query": "6176"} - {"num_terms_per_query": "3", "num_queries": "50000", "ns_per_query": "16236"} - {"num_terms_per_query": "4", "num_queries": "50000", "ns_per_query": "17306"} - {"num_terms_per_query": "5", "num_queries": "50000", "ns_per_query": "15591"} - {"num_terms_per_query": "6", "num_queries": "50000", "ns_per_query": "13961"} - {"num_terms_per_query": "7", "num_queries": "50000", "ns_per_query": "12980"} - {"num_terms_per_query": "8+", "num_queries": "50000", "ns_per_query": "12311"} - -#### Solution 2 - - {"num_terms_per_query": "1", "num_queries": "50000", "parsing_ns_per_query": "0", "dictionary_search_ns_per_query": "6", "conjunctive_search_ns_per_query": "3722", "reporting_ns_per_query": "511"} - {"num_terms_per_query": "2", "num_queries": "50000", "parsing_ns_per_query": "56", "dictionary_search_ns_per_query": "20", "conjunctive_search_ns_per_query": "15134", "reporting_ns_per_query": "2043"} - {"num_terms_per_query": "3", "num_queries": "50000", "parsing_ns_per_query": "835", "dictionary_search_ns_per_query": "20", "conjunctive_search_ns_per_query": "15310", "reporting_ns_per_query": "1072"} - {"num_terms_per_query": "4", "num_queries": "50000", "parsing_ns_per_query": "1117", "dictionary_search_ns_per_query": "19", "conjunctive_search_ns_per_query": "14672", "reporting_ns_per_query": "517"} - {"num_terms_per_query": "5", "num_queries": "50000", "parsing_ns_per_query": "1704", "dictionary_search_ns_per_query": "14", "conjunctive_search_ns_per_query": "12384", "reporting_ns_per_query": "300"} - {"num_terms_per_query": "6", "num_queries": "50000", "parsing_ns_per_query": "2164", "dictionary_search_ns_per_query": "13", "conjunctive_search_ns_per_query": "10222", "reporting_ns_per_query": "246"} - {"num_terms_per_query": "7", "num_queries": "50000", "parsing_ns_per_query": "2567", "dictionary_search_ns_per_query": "12", "conjunctive_search_ns_per_query": "8579", "reporting_ns_per_query": "305"} - {"num_terms_per_query": "8+", "num_queries": "50000", "parsing_ns_per_query": "3670", "dictionary_search_ns_per_query": "12", "conjunctive_search_ns_per_query": "6644", "reporting_ns_per_query": "714"} - - {"num_terms_per_query": "1", "num_queries": "50000", "ns_per_query": "5667"} - {"num_terms_per_query": "2", "num_queries": "50000", "ns_per_query": "19144"} - {"num_terms_per_query": "3", "num_queries": "50000", "ns_per_query": "18886"} - {"num_terms_per_query": "4", "num_queries": "50000", "ns_per_query": "18109"} - {"num_terms_per_query": "5", "num_queries": "50000", "ns_per_query": "16030"} - {"num_terms_per_query": "6", "num_queries": "50000", "ns_per_query": "14423"} - {"num_terms_per_query": "7", "num_queries": "50000", "ns_per_query": "13418"} - {"num_terms_per_query": "8+", "num_queries": "50000", "ns_per_query": "12779"} \ No newline at end of file diff --git a/results/fc_dictionary.md b/results/fc_dictionary.md deleted file mode 100644 index 39e64b7..0000000 --- a/results/fc_dictionary.md +++ /dev/null @@ -1,40 +0,0 @@ -#### Results on 
the AOL querylog. - - pibiri@rubino:~/autocomplete/build$ ./benchmark_fc_dictionary ../test_data/aol/aol.completions 1000000 < ../test_data/aol/aol.completions.dict_queries.1M.shuffled - 2019-10-14 14:54:24: loading queries... - 2019-10-14 14:54:24: loaded 1000000 queries - 2019-10-14 14:54:24: building fc_dictionary with bucket size 4... - 2019-10-14 14:54:25: DONE - using 42938890 bytes - locate: 559.666 [ns/string] - extract: 165.846 [ns/string] - 2019-10-14 14:54:32: building fc_dictionary with bucket size 8... - 2019-10-14 14:54:33: DONE - using 38111527 bytes - locate: 515.359 [ns/string] - extract: 151.121 [ns/string] - 2019-10-14 14:54:40: building fc_dictionary with bucket size 16... - 2019-10-14 14:54:40: DONE - using 35270205 bytes - locate: 474.319 [ns/string] - extract: 138.07 [ns/string] - 2019-10-14 14:54:47: building fc_dictionary with bucket size 32... - 2019-10-14 14:54:47: DONE - using 33722303 bytes - locate: 490 [ns/string] - extract: 150.671 [ns/string] - 2019-10-14 14:54:54: building fc_dictionary with bucket size 64... - 2019-10-14 14:54:54: DONE - using 32910194 bytes - locate: 585.408 [ns/string] - extract: 197.131 [ns/string] - 2019-10-14 14:55:03: building fc_dictionary with bucket size 128... - 2019-10-14 14:55:03: DONE - using 32496375 bytes - locate: 812.441 [ns/string] - extract: 293.022 [ns/string] - 2019-10-14 14:55:15: building fc_dictionary with bucket size 256... - 2019-10-14 14:55:15: DONE - using 32286042 bytes - locate: 1283.83 [ns/string] - extract: 485.985 [ns/string] \ No newline at end of file diff --git a/results/integer_fc_dictionary.md b/results/integer_fc_dictionary.md deleted file mode 100644 index 955afe0..0000000 --- a/results/integer_fc_dictionary.md +++ /dev/null @@ -1,31 +0,0 @@ -#### Results on the AOL querylog. - - pibiri@rubino:~/autocomplete/build$ ./benchmark_integer_fc_dictionary ../test_data/aol/aol.completions 1000000 - 2019-10-14 15:28:12: building integer_fc_dictionary with bucket size 4... - 2019-10-14 15:28:14: DONE - using 129855836 bytes - extract: 102.787 [ns/string] - 2019-10-14 15:28:15: building integer_fc_dictionary with bucket size 8... - 2019-10-14 15:28:18: DONE - using 112779868 bytes - extract: 98.9981 [ns/string] - 2019-10-14 15:28:19: building integer_fc_dictionary with bucket size 16... - 2019-10-14 15:28:21: DONE - using 102740006 bytes - extract: 103.745 [ns/string] - 2019-10-14 15:28:22: building integer_fc_dictionary with bucket size 32... - 2019-10-14 15:28:24: DONE - using 97266766 bytes - extract: 136.042 [ns/string] - 2019-10-14 15:28:26: building integer_fc_dictionary with bucket size 64... - 2019-10-14 15:28:28: DONE - using 94397632 bytes - extract: 207.699 [ns/string] - 2019-10-14 15:28:30: building integer_fc_dictionary with bucket size 128... - 2019-10-14 15:28:32: DONE - using 92933198 bytes - extract: 354.622 [ns/string] - 2019-10-14 15:28:36: building integer_fc_dictionary with bucket size 256... 
- 2019-10-14 15:28:38: DONE - using 92192244 bytes - extract: 651.357 [ns/string] \ No newline at end of file diff --git a/results/inverted_index_space.md b/results/inverted_index_space.md deleted file mode 100644 index f3acd81..0000000 --- a/results/inverted_index_space.md +++ /dev/null @@ -1,19 +0,0 @@ -Inverted index compression ---- - -#### AOL - - EF -- 17.1495 bits per element - PEF uniform -- 16.5788 bits per element - PEF opt -- 15.0967 bits per element - PFOR -- 15.2661 bits per element - BIC -- 14.1396 bits per element - Simple9 -- 21.8895 bits per element - Simple16 -- 21.7385 bits per element - VByte -- 20.9531 bits per element - Varint -- 21.996 bits per element - Gamma -- 23.6305 bits per element - Delta -- 19.2088 bits per element - Rice -- 19.4145 bits per element - DINT single -- 15.4204 bits per element - DINT multi -- 15.084 bits per element \ No newline at end of file diff --git a/results/prefix_topk.md b/results/prefix_topk.md deleted file mode 100644 index 6404bc4..0000000 --- a/results/prefix_topk.md +++ /dev/null @@ -1,94 +0,0 @@ -Prefix top-k ------------ - -Executing queries shuffled at random, for k = 7. - -Average among 10 runs. - -From the last token of the query, we only retain the first character. This means that we spend less time obtaining the lexicographic range of the character (string comparisons are -very fast), but we spend more time on the RMQ phase, because the -range obtained from the completion trie can be very large. - -### AOL - -#### Solution 1 - - {"num_terms_per_query": "1", "num_queries": "50000", "parsing_ns_per_query": "0", "completions_search_ns_per_query": "279", "topk_rmq_ns_per_query": "2887", "reporting_ns_per_query": "317"} - {"num_terms_per_query": "2", "num_queries": "50000", "parsing_ns_per_query": "47", "completions_search_ns_per_query": "853", "topk_rmq_ns_per_query": "576", "reporting_ns_per_query": "1851"} - {"num_terms_per_query": "3", "num_queries": "50000", "parsing_ns_per_query": "706", "completions_search_ns_per_query": "945", "topk_rmq_ns_per_query": "95", "reporting_ns_per_query": "717"} - {"num_terms_per_query": "4", "num_queries": "50000", "parsing_ns_per_query": "1055", "completions_search_ns_per_query": "1057", "topk_rmq_ns_per_query": "22", "reporting_ns_per_query": "332"} - {"num_terms_per_query": "5", "num_queries": "50000", "parsing_ns_per_query": "1495", "completions_search_ns_per_query": "1215", "topk_rmq_ns_per_query": "9", "reporting_ns_per_query": "325"} - {"num_terms_per_query": "6", "num_queries": "50000", "parsing_ns_per_query": "1957", "completions_search_ns_per_query": "1434", "topk_rmq_ns_per_query": "3", "reporting_ns_per_query": "425"} - {"num_terms_per_query": "7", "num_queries": "50000", "parsing_ns_per_query": "2410", "completions_search_ns_per_query": "1581", "topk_rmq_ns_per_query": "2", "reporting_ns_per_query": "611"} - {"num_terms_per_query": "8+", "num_queries": "50000", "parsing_ns_per_query": "3360", "completions_search_ns_per_query": "1888", "topk_rmq_ns_per_query": "2", "reporting_ns_per_query": "913"} - - {"num_terms_per_query": "1", "num_queries": "50000", "ns_per_query": "5027"} - {"num_terms_per_query": "2", "num_queries": "50000", "ns_per_query": "4974"} - {"num_terms_per_query": "3", "num_queries": "50000", "ns_per_query": "3984"} - {"num_terms_per_query": "4", "num_queries": "50000", "ns_per_query": "4137"} - {"num_terms_per_query": "5", "num_queries": "50000", "ns_per_query": "4660"} - {"num_terms_per_query": "6", "num_queries": "50000", "ns_per_query": "5335"} -
{"num_terms_per_query": "7", "num_queries": "50000", "ns_per_query": "5785"} - {"num_terms_per_query": "8+", "num_queries": "50000", "ns_per_query": "7394"} - -#### Solution 2 - - {"num_terms_per_query": "1", "num_queries": "50000", "parsing_ns_per_query": "0", "completions_search_ns_per_query": "493", "topk_rmq_ns_per_query": "3072", "reporting_ns_per_query": "628"} - {"num_terms_per_query": "2", "num_queries": "50000", "parsing_ns_per_query": "60", "completions_search_ns_per_query": "1078", "topk_rmq_ns_per_query": "589", "reporting_ns_per_query": "1897"} - {"num_terms_per_query": "3", "num_queries": "50000", "parsing_ns_per_query": "675", "completions_search_ns_per_query": "1053", "topk_rmq_ns_per_query": "96", "reporting_ns_per_query": "730"} - {"num_terms_per_query": "4", "num_queries": "50000", "parsing_ns_per_query": "1047", "completions_search_ns_per_query": "1081", "topk_rmq_ns_per_query": "21", "reporting_ns_per_query": "320"} - {"num_terms_per_query": "5", "num_queries": "50000", "parsing_ns_per_query": "1367", "completions_search_ns_per_query": "1112", "topk_rmq_ns_per_query": "8", "reporting_ns_per_query": "244"} - {"num_terms_per_query": "6", "num_queries": "50000", "parsing_ns_per_query": "1886", "completions_search_ns_per_query": "1139", "topk_rmq_ns_per_query": "3", "reporting_ns_per_query": "300"} - {"num_terms_per_query": "7", "num_queries": "50000", "parsing_ns_per_query": "2242", "completions_search_ns_per_query": "1166", "topk_rmq_ns_per_query": "3", "reporting_ns_per_query": "455"} - {"num_terms_per_query": "8+", "num_queries": "50000", "parsing_ns_per_query": "3229", "completions_search_ns_per_query": "1205", "topk_rmq_ns_per_query": "2", "reporting_ns_per_query": "809"} - - {"num_terms_per_query": "1", "num_queries": "50000", "ns_per_query": "5768"} - {"num_terms_per_query": "2", "num_queries": "50000", "ns_per_query": "5625"} - {"num_terms_per_query": "3", "num_queries": "50000", "ns_per_query": "4389"} - {"num_terms_per_query": "4", "num_queries": "50000", "ns_per_query": "4421"} - {"num_terms_per_query": "5", "num_queries": "50000", "ns_per_query": "4830"} - {"num_terms_per_query": "6", "num_queries": "50000", "ns_per_query": "5336"} - {"num_terms_per_query": "7", "num_queries": "50000", "ns_per_query": "5963"} - {"num_terms_per_query": "8+", "num_queries": "50000", "ns_per_query": "7104"} - -### MSN - -#### Solution 1 - - {"num_terms_per_query": "1", "num_queries": "50000", "parsing_ns_per_query": "0", "completions_search_ns_per_query": "403", "topk_rmq_ns_per_query": "3211", "reporting_ns_per_query": "509"} - {"num_terms_per_query": "2", "num_queries": "50000", "parsing_ns_per_query": "33", "completions_search_ns_per_query": "784", "topk_rmq_ns_per_query": "312", "reporting_ns_per_query": "1287"} - {"num_terms_per_query": "3", "num_queries": "50000", "parsing_ns_per_query": "596", "completions_search_ns_per_query": "906", "topk_rmq_ns_per_query": "49", "reporting_ns_per_query": "423"} - {"num_terms_per_query": "4", "num_queries": "50000", "parsing_ns_per_query": "1026", "completions_search_ns_per_query": "1015", "topk_rmq_ns_per_query": "11", "reporting_ns_per_query": "206"} - {"num_terms_per_query": "5", "num_queries": "50000", "parsing_ns_per_query": "1434", "completions_search_ns_per_query": "1114", "topk_rmq_ns_per_query": "5", "reporting_ns_per_query": "217"} - {"num_terms_per_query": "6", "num_queries": "50000", "parsing_ns_per_query": "1938", "completions_search_ns_per_query": "1273", "topk_rmq_ns_per_query": "2", "reporting_ns_per_query": "330"} - 
{"num_terms_per_query": "7", "num_queries": "50000", "parsing_ns_per_query": "2362", "completions_search_ns_per_query": "1437", "topk_rmq_ns_per_query": "0", "reporting_ns_per_query": "545"} - {"num_terms_per_query": "8+", "num_queries": "50000", "parsing_ns_per_query": "3186", "completions_search_ns_per_query": "1737", "topk_rmq_ns_per_query": "1", "reporting_ns_per_query": "873"} - - {"num_terms_per_query": "1", "num_queries": "50000", "ns_per_query": "5804"} - {"num_terms_per_query": "2", "num_queries": "50000", "ns_per_query": "4006"} - {"num_terms_per_query": "3", "num_queries": "50000", "ns_per_query": "3456"} - {"num_terms_per_query": "4", "num_queries": "50000", "ns_per_query": "3873"} - {"num_terms_per_query": "5", "num_queries": "50000", "ns_per_query": "4587"} - {"num_terms_per_query": "6", "num_queries": "50000", "ns_per_query": "5030"} - {"num_terms_per_query": "7", "num_queries": "50000", "ns_per_query": "5617"} - {"num_terms_per_query": "8+", "num_queries": "50000", "ns_per_query": "6957"} - -#### Solution 2 - - {"num_terms_per_query": "1", "num_queries": "50000", "parsing_ns_per_query": "0", "completions_search_ns_per_query": "697", "topk_rmq_ns_per_query": "3495", "reporting_ns_per_query": "1114"} - {"num_terms_per_query": "2", "num_queries": "50000", "parsing_ns_per_query": "32", "completions_search_ns_per_query": "1038", "topk_rmq_ns_per_query": "321", "reporting_ns_per_query": "1384"} - {"num_terms_per_query": "3", "num_queries": "50000", "parsing_ns_per_query": "547", "completions_search_ns_per_query": "1029", "topk_rmq_ns_per_query": "51", "reporting_ns_per_query": "455"} - {"num_terms_per_query": "4", "num_queries": "50000", "parsing_ns_per_query": "1012", "completions_search_ns_per_query": "1038", "topk_rmq_ns_per_query": "11", "reporting_ns_per_query": "210"} - {"num_terms_per_query": "5", "num_queries": "50000", "parsing_ns_per_query": "1318", "completions_search_ns_per_query": "1066", "topk_rmq_ns_per_query": "5", "reporting_ns_per_query": "172"} - {"num_terms_per_query": "6", "num_queries": "50000", "parsing_ns_per_query": "1922", "completions_search_ns_per_query": "1077", "topk_rmq_ns_per_query": "1", "reporting_ns_per_query": "242"} - {"num_terms_per_query": "7", "num_queries": "50000", "parsing_ns_per_query": "2213", "completions_search_ns_per_query": "1099", "topk_rmq_ns_per_query": "1", "reporting_ns_per_query": "425"} - {"num_terms_per_query": "8+", "num_queries": "50000", "parsing_ns_per_query": "3228", "completions_search_ns_per_query": "1124", "topk_rmq_ns_per_query": "0", "reporting_ns_per_query": "799"} - - {"num_terms_per_query": "1", "num_queries": "50000", "ns_per_query": "6772"} - {"num_terms_per_query": "2", "num_queries": "50000", "ns_per_query": "4646"} - {"num_terms_per_query": "3", "num_queries": "50000", "ns_per_query": "3831"} - {"num_terms_per_query": "4", "num_queries": "50000", "ns_per_query": "4108"} - {"num_terms_per_query": "5", "num_queries": "50000", "ns_per_query": "4594"} - {"num_terms_per_query": "6", "num_queries": "50000", "ns_per_query": "5080"} - {"num_terms_per_query": "7", "num_queries": "50000", "ns_per_query": "5621"} - {"num_terms_per_query": "8+", "num_queries": "50000", "ns_per_query": "6775"} \ No newline at end of file diff --git a/results/space.md b/results/space.md deleted file mode 100644 index 64ac1a2..0000000 --- a/results/space.md +++ /dev/null @@ -1,159 +0,0 @@ -AOL 2006 query log ------------------- - -10,142,395 distinct queries, whose ids have been assigned -in decreasing frequency order (ties broken 
lexicographically). - -#### Solution 1 - - using 1.05555 [GiB] - completions: 0.520278 [GiB] (49.2899%) - unsorted docs list: 0.0409812 [GiB] (3.88246%) - unsorted minimal docs list: 0.0154568 [GiB] (1.46434%) - dictionary: 0.0328479 [GiB] (3.11194%) - inverted index: 0.144273 [GiB] (13.6681%) - data: 33.0401 [bpi] - pointers: 8.13526 [bpi] - forward index: 0.30171 [GiB] (28.5833%) - data: 42.6801 [bpi] - pointers: 42.8379 [bpi] - - - + Elias-Fano - using 0.370675 [GiB] - completions: 0.0867222 [GiB] (23.3958%) - unsorted docs list: 0.0409812 [GiB] (11.0558%) - unsorted minimal docs list: 0.0154568 [GiB] (4.1699%) - dictionary: 0.0328479 [GiB] (8.86166%) - inverted index: 0.0595939 [GiB] (16.0771%) - data: 15.7999 [bpi] - pointers: 1.20819 [bpi] - forward index: 0.135073 [GiB] (36.4397%) - data: 32.866 [bpi] - pointers: 5.41964 [bpi] - - + Elias-Fano and compact_forward_index - using 0.318008 [GiB] - completions: 0.0867222 [GiB] (27.2704%) - unsorted docs list: 0.0409812 [GiB] (12.8868%) - unsorted minimal docs list: 0.0154568 [GiB] (4.86049%) - dictionary: 0.0328479 [GiB] (10.3293%) - inverted index: 0.0595939 [GiB] (18.7397%) - data: 15.7999 [bpi] - pointers: 1.20819 [bpi] - forward index: 0.0824065 [GiB] (25.9133%) - data: 22 [bpi] - pointers: 1.35762 [bpi] - - + Elias-Fano and delta_forward_index - using 0.350595 [GiB] - completions: 0.086722 [GiB] (24.7356%) - unsorted docs list: 0.0409812 [GiB] (11.689%) - unsorted minimal docs list: 0.0154568 [GiB] (4.40872%) - dictionary: 0.0328479 [GiB] (9.36919%) - data: 69.9866 [bps] - pointers: 3.76476 [bps] - inverted index: 0.0595939 [GiB] (16.9979%) - data: 15.7999 [bpi] - pointers: 1.20819 [bpi] - forward index: 0.114994 [GiB] (32.7995%) - data: 29.6008 [bpi] - pointers: 2.99348 [bpi] - - + Elias-Fano + compact_forward_index + compact_unsorted_lists - using 0.304999 [GiB] - completions: 0.086722 [GiB] (28.4335%) - unsorted docs list: 0.0315353 [GiB] (10.3395%) - unsorted minimal docs list: 0.0118937 [GiB] (3.89958%) - dictionary: 0.0328479 [GiB] (10.7698%) - data: 69.9866 [bps] - pointers: 3.76476 [bps] - inverted index: 0.0595939 [GiB] (19.539%) - data: 15.7999 [bpi] - pointers: 1.20819 [bpi] - forward index: 0.0824065 [GiB] (27.0186%) - data: 22 [bpi] - pointers: 1.35762 [bpi] - -#### Solution 2 - - using 0.377843 [GiB] - completions: 0.0956838 [GiB] (25.3237%) - unsorted docs list: 0.0409812 [GiB] (10.8461%) - unsorted minimal docs list: 0.0154568 [GiB] (4.09079%) - dictionary: 0.0330574 [GiB] (8.74898%) - inverted index: 0.154881 [GiB] (40.9907%) - map from docid to lexid: 0.0377834 [GiB] (9.99975%) - - - + Elias-Fano - using 0.259893 [GiB] - completions: 0.0956841 [GiB] (36.8168%) - data: 73.5086 [bps] - pointers: 7.52944 [bps] - unsorted docs list: 0.0315353 [GiB] (12.134%) - unsorted minimal docs list: 0.0118937 [GiB] (4.57639%) - dictionary: 0.0328479 [GiB] (12.639%) - data: 69.9866 [bps] - pointers: 3.76476 [bps] - inverted index: 0.0595939 [GiB] (22.9302%) - data: 15.7999 [bpi] - pointers: 1.20819 [bpi] - map from docid to lexid: 0.0283376 [GiB] (10.9036%) - - -MSN 2006 query log ------------------- - -7,083,363 distinct queries, whose ids have been assigned -in decreasing frequency order (ties broken lexicographically). 
- -#### Solution 1 - - using 0.769592 [GiB] - completion trie: 0.370163 [GiB] (48.0986%) - unsorted docs list: 0.0286179 [GiB] (3.71858%) - unsorted minimal docs list: 0.0104689 [GiB] (1.36031%) - dictionary: 0.0220881 [GiB] (2.87011%) - inverted index: 0.107578 [GiB] (13.9785%) - forward index: 0.230677 [GiB] (29.9739%) - - + compression - using 0.213269 [GiB] - completions: 0.0617906 [GiB] (28.973%) - unsorted docs list: 0.0211964 [GiB] (9.9388%) - unsorted minimal docs list: 0.00775427 [GiB] (3.6359%) - dictionary: 0.0219463 [GiB] (10.2904%) - data: 68.9954 [bps] - pointers: 3.7648 [bps] - inverted index: 0.0429281 [GiB] (20.1286%) - data: 16.2938 [bpi] - pointers: 1.1785 [bpi] - forward index: 0.0576538 [GiB] (27.0333%) - data: 22 [bpi] - pointers: 1.35605 [bpi] - -#### Solution 2 - - using 0.263256 [GiB] - completions: 0.0681158 [GiB] (25.8744%) - unsorted docs list: 0.0286179 [GiB] (10.8708%) - unsorted minimal docs list: 0.0104689 [GiB] (3.97669%) - dictionary: 0.0220881 [GiB] (8.39036%) - inverted index: 0.107578 [GiB] (40.8643%) - map from docid to lexid: 0.0263876 [GiB] (10.0236%) - - + compression - using 0.180907 [GiB] - completions: 0.0681161 [GiB] (37.6525%) - data: 75.0743 [bps] - pointers: 7.52946 [bps] - unsorted docs list: 0.0211964 [GiB] (11.7167%) - unsorted minimal docs list: 0.00775427 [GiB] (4.28633%) - dictionary: 0.0219463 [GiB] (12.1312%) - data: 68.9954 [bps] - pointers: 3.7648 [bps] - inverted index: 0.0429281 [GiB] (23.7293%) - data: 16.2938 [bpi] - pointers: 1.1785 [bpi] - map from docid to lexid: 0.0189661 [GiB] (10.4839%) \ No newline at end of file diff --git a/results/topk.md b/results/topk.md deleted file mode 100644 index b101b43..0000000 --- a/results/topk.md +++ /dev/null @@ -1,201 +0,0 @@ -Top-k ------------------ - -Executing queries shuffled at random, for k = 7. - -Average among 10 runs. 
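The timings below measure the combination of the two steps. A condensed sketch of the control flow, mirroring the deleted `autocomplete3::topk` shown earlier (the two `*_count` helpers are hypothetical wrappers, not functions of the library):

```cpp
#include <cstdint>
#include <string>

// Run the cheap prefix step first; fall back to the more expensive
// conjunctive step only when fewer than k completions were found.
// `Index` stands in for any of the autocomplete* classes.
template <typename Index>
uint32_t combined_topk(Index& index, std::string const& query, uint32_t k) {
    uint32_t num = index.prefix_topk_count(query, k);           // step (1)
    if (num < k) num = index.conjunctive_topk_count(query, k);  // step (2)
    return num;
}
```

The conjunctive step, which the numbers below show to dominate for queries of three or more terms, is thus skipped whenever the prefix step already fills the k slots.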
-
-### AOL
-
-#### Solution 1
-
-    {"num_terms_per_query": "1", "num_queries": "50000", "ns_per_query": "5062"}
-    {"num_terms_per_query": "2", "num_queries": "50000", "ns_per_query": "6725"}
-    {"num_terms_per_query": "3", "num_queries": "50000", "ns_per_query": "24960"}
-    {"num_terms_per_query": "4", "num_queries": "50000", "ns_per_query": "32761"}
-    {"num_terms_per_query": "5", "num_queries": "50000", "ns_per_query": "31450"}
-    {"num_terms_per_query": "6", "num_queries": "50000", "ns_per_query": "28812"}
-    {"num_terms_per_query": "7", "num_queries": "50000", "ns_per_query": "25978"}
-    {"num_terms_per_query": "8+", "num_queries": "50000", "ns_per_query": "22785"}
-
-    + Elias-Fano
-    {"num_terms_per_query": "1", "num_queries": "10000", "ns_per_query": "5614"}
-    {"num_terms_per_query": "2", "num_queries": "10000", "ns_per_query": "9767"}
-    {"num_terms_per_query": "3", "num_queries": "10000", "ns_per_query": "26999"}
-    {"num_terms_per_query": "4", "num_queries": "10000", "ns_per_query": "35428"}
-    {"num_terms_per_query": "5", "num_queries": "10000", "ns_per_query": "36073"}
-    {"num_terms_per_query": "6", "num_queries": "10000", "ns_per_query": "31718"}
-    {"num_terms_per_query": "7", "num_queries": "10000", "ns_per_query": "29992"}
-    {"num_terms_per_query": "8+", "num_queries": "10000", "ns_per_query": "27313"}
-
-    + Elias-Fano and forward_index2
-    {"num_terms_per_query": "1", "num_queries": "10000", "ns_per_query": "5336"}
-    {"num_terms_per_query": "2", "num_queries": "10000", "ns_per_query": "7573"}
-    {"num_terms_per_query": "3", "num_queries": "10000", "ns_per_query": "26278"}
-    {"num_terms_per_query": "4", "num_queries": "10000", "ns_per_query": "35664"}
-    {"num_terms_per_query": "5", "num_queries": "10000", "ns_per_query": "35189"}
-    {"num_terms_per_query": "6", "num_queries": "10000", "ns_per_query": "32033"}
-    {"num_terms_per_query": "7", "num_queries": "10000", "ns_per_query": "29950"}
-    {"num_terms_per_query": "8+", "num_queries": "10000", "ns_per_query": "27332"}
-
-#### Solution 2
-
-    {"num_terms_per_query": "1", "num_queries": "50000", "ns_per_query": "5812"}
-    {"num_terms_per_query": "2", "num_queries": "50000", "ns_per_query": "12703"}
-    {"num_terms_per_query": "3", "num_queries": "50000", "ns_per_query": "27307"}
-    {"num_terms_per_query": "4", "num_queries": "50000", "ns_per_query": "33476"}
-    {"num_terms_per_query": "5", "num_queries": "50000", "ns_per_query": "31403"}
-    {"num_terms_per_query": "6", "num_queries": "50000", "ns_per_query": "28718"}
-    {"num_terms_per_query": "7", "num_queries": "50000", "ns_per_query": "25728"}
-    {"num_terms_per_query": "8+", "num_queries": "50000", "ns_per_query": "22419"}
-
-    + Elias-Fano
-    {"num_terms_per_query": "1", "num_queries": "10000", "ns_per_query": "5609"}
-    {"num_terms_per_query": "2", "num_queries": "10000", "ns_per_query": "10894"}
-    {"num_terms_per_query": "3", "num_queries": "10000", "ns_per_query": "27311"}
-    {"num_terms_per_query": "4", "num_queries": "10000", "ns_per_query": "34780"}
-    {"num_terms_per_query": "5", "num_queries": "10000", "ns_per_query": "33849"}
-    {"num_terms_per_query": "6", "num_queries": "10000", "ns_per_query": "30319"}
-    {"num_terms_per_query": "7", "num_queries": "10000", "ns_per_query": "28181"}
-    {"num_terms_per_query": "8+", "num_queries": "10000", "ns_per_query": "24757"}
-
-#### Solution 3
-
-    {"num_terms_per_query": "1", "num_queries": "1000", "ns_per_query": "5899"}
-    {"num_terms_per_query": "2", "num_queries": "1000", "ns_per_query": "12282007"}
-    {"num_terms_per_query": "3", "num_queries": "1000", "ns_per_query": "18393403"}
-    {"num_terms_per_query": "4", "num_queries": "1000", "ns_per_query": "15212918"}
-    {"num_terms_per_query": "5", "num_queries": "1000", "ns_per_query": "11852012"}
-    {"num_terms_per_query": "6", "num_queries": "1000", "ns_per_query": "7781194"}
-    {"num_terms_per_query": "7", "num_queries": "1000", "ns_per_query": "7939661"}
-    {"num_terms_per_query": "8+", "num_queries": "1000", "ns_per_query": "6980226"}
-
-    + Elias-Fano
-    {"num_terms_per_query": "1", "num_queries": "1000", "ns_per_query": "6024"}
-    {"num_terms_per_query": "2", "num_queries": "1000", "ns_per_query": "20553345"}
-    {"num_terms_per_query": "3", "num_queries": "1000", "ns_per_query": "32495295"}
-    {"num_terms_per_query": "4", "num_queries": "1000", "ns_per_query": "30929833"}
-    {"num_terms_per_query": "5", "num_queries": "1000", "ns_per_query": "27103519"}
-    {"num_terms_per_query": "6", "num_queries": "1000", "ns_per_query": "19912460"}
-    {"num_terms_per_query": "7", "num_queries": "1000", "ns_per_query": "20956205"}
-    {"num_terms_per_query": "8+", "num_queries": "1000", "ns_per_query": "19643570"}
-
-#### Solution 4
-
-    c = 0.005
-    {"num_terms_per_query": "1", "num_queries": "100", "ns_per_query": "6593"}
-    {"num_terms_per_query": "2", "num_queries": "100", "ns_per_query": "756944"}
-    {"num_terms_per_query": "3", "num_queries": "100", "ns_per_query": "2188766"}
-    {"num_terms_per_query": "4", "num_queries": "100", "ns_per_query": "1920720"}
-    {"num_terms_per_query": "5", "num_queries": "100", "ns_per_query": "2398355"}
-    {"num_terms_per_query": "6", "num_queries": "100", "ns_per_query": "1711205"}
-    {"num_terms_per_query": "7", "num_queries": "100", "ns_per_query": "2195672"}
-    {"num_terms_per_query": "8+", "num_queries": "100", "ns_per_query": "2115028"}
-
-    c = 0.01
-    {"num_terms_per_query": "1", "num_queries": "100", "ns_per_query": "6610"}
-    {"num_terms_per_query": "2", "num_queries": "100", "ns_per_query": "739838"}
-    {"num_terms_per_query": "3", "num_queries": "100", "ns_per_query": "2147339"}
-    {"num_terms_per_query": "4", "num_queries": "100", "ns_per_query": "1988980"}
-    {"num_terms_per_query": "5", "num_queries": "100", "ns_per_query": "2440435"}
-    {"num_terms_per_query": "6", "num_queries": "100", "ns_per_query": "1858965"}
-    {"num_terms_per_query": "7", "num_queries": "100", "ns_per_query": "2304761"}
-    {"num_terms_per_query": "8+", "num_queries": "100", "ns_per_query": "2254481"}
-
-    c = 0.01, + Elias-Fano
-    {"num_terms_per_query": "1", "num_queries": "1000", "ns_per_query": "5879"}
-    {"num_terms_per_query": "2", "num_queries": "1000", "ns_per_query": "1754176"}
-    {"num_terms_per_query": "3", "num_queries": "1000", "ns_per_query": "3435481"}
-    {"num_terms_per_query": "4", "num_queries": "1000", "ns_per_query": "4442784"}
-    {"num_terms_per_query": "5", "num_queries": "1000", "ns_per_query": "4946228"}
-    {"num_terms_per_query": "6", "num_queries": "1000", "ns_per_query": "4818169"}
-    {"num_terms_per_query": "7", "num_queries": "1000", "ns_per_query": "5157776"}
-    {"num_terms_per_query": "8+", "num_queries": "1000", "ns_per_query": "5431935"}
-
-    c = 0.025
-    {"num_terms_per_query": "1", "num_queries": "100", "ns_per_query": "6528"}
-    {"num_terms_per_query": "2", "num_queries": "100", "ns_per_query": "828082"}
-    {"num_terms_per_query": "3", "num_queries": "100", "ns_per_query": "2422803"}
-    {"num_terms_per_query": "4", "num_queries": "100", "ns_per_query": "2482018"}
-    {"num_terms_per_query": "5", "num_queries": "100", "ns_per_query": "2970064"}
-    {"num_terms_per_query": "6", "num_queries": "100", "ns_per_query": "2542134"}
-    {"num_terms_per_query": "7", "num_queries": "100", "ns_per_query": "2972710"}
-    {"num_terms_per_query": "8+", "num_queries": "100", "ns_per_query": "2924603"}
-
-    c = 0.05
-    {"num_terms_per_query": "1", "num_queries": "100", "ns_per_query": "6508"}
-    {"num_terms_per_query": "2", "num_queries": "100", "ns_per_query": "1059938"}
-    {"num_terms_per_query": "3", "num_queries": "100", "ns_per_query": "3046716"}
-    {"num_terms_per_query": "4", "num_queries": "100", "ns_per_query": "3528723"}
-    {"num_terms_per_query": "5", "num_queries": "100", "ns_per_query": "4037290"}
-    {"num_terms_per_query": "6", "num_queries": "100", "ns_per_query": "3850329"}
-    {"num_terms_per_query": "7", "num_queries": "100", "ns_per_query": "4371489"}
-    {"num_terms_per_query": "8+", "num_queries": "100", "ns_per_query": "4648349"}
-
-    c = 0.1
-    {"num_terms_per_query": "1", "num_queries": "100", "ns_per_query": "6584"}
-    {"num_terms_per_query": "2", "num_queries": "100", "ns_per_query": "1600869"}
-    {"num_terms_per_query": "3", "num_queries": "100", "ns_per_query": "4501125"}
-    {"num_terms_per_query": "4", "num_queries": "100", "ns_per_query": "5562030"}
-    {"num_terms_per_query": "5", "num_queries": "100", "ns_per_query": "6634491"}
-    {"num_terms_per_query": "6", "num_queries": "100", "ns_per_query": "6768321"}
-    {"num_terms_per_query": "7", "num_queries": "100", "ns_per_query": "7124462"}
-    {"num_terms_per_query": "8+", "num_queries": "100", "ns_per_query": "7733525"}
-
-    c = 0.2
-    {"num_terms_per_query": "1", "num_queries": "100", "ns_per_query": "6589"}
-    {"num_terms_per_query": "2", "num_queries": "100", "ns_per_query": "2831409"}
-    {"num_terms_per_query": "3", "num_queries": "100", "ns_per_query": "7641806"}
-    {"num_terms_per_query": "4", "num_queries": "100", "ns_per_query": "9881857"}
-    {"num_terms_per_query": "5", "num_queries": "100", "ns_per_query": "11138148"}
-    {"num_terms_per_query": "6", "num_queries": "100", "ns_per_query": "11643908"}
-    {"num_terms_per_query": "7", "num_queries": "100", "ns_per_query": "11966417"}
-    {"num_terms_per_query": "8+", "num_queries": "100", "ns_per_query": "12460833"}
-
-### MSN
-
-#### Solution 1
-
-    {"num_terms_per_query": "1", "num_queries": "50000", "ns_per_query": "5823"}
-    {"num_terms_per_query": "2", "num_queries": "50000", "ns_per_query": "6251"}
-    {"num_terms_per_query": "3", "num_queries": "50000", "ns_per_query": "16502"}
-    {"num_terms_per_query": "4", "num_queries": "50000", "ns_per_query": "18380"}
-    {"num_terms_per_query": "5", "num_queries": "50000", "ns_per_query": "17044"}
-    {"num_terms_per_query": "6", "num_queries": "50000", "ns_per_query": "15622"}
-    {"num_terms_per_query": "7", "num_queries": "50000", "ns_per_query": "14709"}
-    {"num_terms_per_query": "8+", "num_queries": "50000", "ns_per_query": "14323"}
-
-#### Solution 2
-
-    {"num_terms_per_query": "1", "num_queries": "50000", "ns_per_query": "6837"}
-    {"num_terms_per_query": "2", "num_queries": "50000", "ns_per_query": "14469"}
-    {"num_terms_per_query": "3", "num_queries": "50000", "ns_per_query": "18670"}
-    {"num_terms_per_query": "4", "num_queries": "50000", "ns_per_query": "19144"}
-    {"num_terms_per_query": "5", "num_queries": "50000", "ns_per_query": "17109"}
-    {"num_terms_per_query": "6", "num_queries": "50000", "ns_per_query": "15738"}
-    {"num_terms_per_query": "7", "num_queries": "50000", "ns_per_query": "14810"}
-    {"num_terms_per_query": "8+", "num_queries": "50000", "ns_per_query": "14260"}
-
-
-#### Solution 3
-
-
-    {"num_terms_per_query": "1", "num_queries": "1000", "ns_per_query": "6666"}
-    {"num_terms_per_query": "2", "num_queries": "1000", "ns_per_query": "6635754"}
-    {"num_terms_per_query": "3", "num_queries": "1000", "ns_per_query": "8612266"}
-    {"num_terms_per_query": "4", "num_queries": "1000", "ns_per_query": "5290905"}
-    {"num_terms_per_query": "5", "num_queries": "1000", "ns_per_query": "3939319"}
-    {"num_terms_per_query": "6", "num_queries": "1000", "ns_per_query": "3035556"}
-    {"num_terms_per_query": "7", "num_queries": "1000", "ns_per_query": "3106875"}
-    {"num_terms_per_query": "8+", "num_queries": "1000", "ns_per_query": "3089917"}
-
-#### Solution 4 with c = 0.1
-
-    {"num_terms_per_query": "1", "num_queries": "100", "ns_per_query": "7496"}
-    {"num_terms_per_query": "2", "num_queries": "100", "ns_per_query": "1280652"}
-    {"num_terms_per_query": "3", "num_queries": "100", "ns_per_query": "3181191"}
-    {"num_terms_per_query": "4", "num_queries": "100", "ns_per_query": "3722226"}
-    {"num_terms_per_query": "5", "num_queries": "100", "ns_per_query": "4056810"}
-    {"num_terms_per_query": "6", "num_queries": "100", "ns_per_query": "4130288"}
-    {"num_terms_per_query": "7", "num_queries": "100", "ns_per_query": "4282750"}
-    {"num_terms_per_query": "8+", "num_queries": "100", "ns_per_query": "4205507"}
\ No newline at end of file
diff --git a/script/collect_results.py b/script/collect_results.py
deleted file mode 100644
index 9d0dd22..0000000
--- a/script/collect_results.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import sys, os
-
-type = sys.argv[1]
-exe = sys.argv[2] # prefix_top, conjunctive_topk, topk
-dataset_name = sys.argv[3]
-k = sys.argv[4]
-num_queries = sys.argv[5]
-collect_breakdowns = int(sys.argv[6]) # 0 or 1
-
-breakdown = ""
-if collect_breakdowns != 0:
-    breakdown = "--breakdown"
-
-output_filename = dataset_name + "." + exe + ".timings.json"
-
-for i in range(1, 8):
-    os.system("../build/benchmark_" + exe + " " + type + " " + k + " ../build/" + dataset_name + ".bin " + str(i) + " " + str(num_queries) + " " + breakdown + " < ../test_data/" + dataset_name + "/" + dataset_name + ".completions.length=" + str(i) + ".shuffled 2>> " + output_filename)
-os.system("../build/benchmark_" + exe + " " + type + " " + k + " ../build/" + dataset_name + ".bin 8+ " + str(num_queries) + " " + breakdown + " < ../test_data/" + dataset_name + "/" + dataset_name + ".completions.length=8+.shuffled 2>> " + output_filename)
diff --git a/script/collect_results_by_varying_percentage.py b/script/collect_results_by_varying_percentage.py
deleted file mode 100644
index cc1b9a0..0000000
--- a/script/collect_results_by_varying_percentage.py
+++ /dev/null
@@ -1,24 +0,0 @@
-import sys, os
-
-type = sys.argv[1]
-index_filename = sys.argv[2]
-dataset_name = sys.argv[3]
-k = sys.argv[4]
-num_queries = sys.argv[5]
-collect_breakdowns = int(sys.argv[6]) # 0 or 1
-
-output_filename = dataset_name + "." + type
-
-breakdown = ""
-if collect_breakdowns != 0:
-    breakdown = "--breakdown"
-    output_filename += ".breakdown"
-
-output_filename += ".topk.timings.json"
-
-percentages = ["0.0", "0.25", "0.50", "0.75"]
-
-for perc in percentages:
-    for terms in range(2,8): # (1,8)
-        os.system("../build/benchmark_topk " + type + " " + k + " ../build/" + index_filename + " " + str(terms) + " " + str(num_queries) + " " + perc + " " + breakdown + " < ../test_data/" + dataset_name + "/" + dataset_name + ".completions.length=" + str(terms) + ".shuffled 2>> " + output_filename)
-    os.system("../build/benchmark_topk " + type + " " + k + " ../build/" + index_filename + " 8+ " + str(num_queries) + " " + perc + " " + breakdown + " < ../test_data/" + dataset_name + "/" + dataset_name + ".completions.length=8+.shuffled 2>> " + output_filename)
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
deleted file mode 100644
index 0687354..0000000
--- a/test/CMakeLists.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-add_executable(test_completion_trie test_completion_trie.cpp)
-add_executable(test_fc_dictionary test_fc_dictionary.cpp)
-add_executable(test_integer_fc_dictionary test_integer_fc_dictionary.cpp)
-add_executable(test_cartesian_tree test_cartesian_tree.cpp)
-add_executable(test_inverted_index test_inverted_index.cpp)
-add_executable(test_forward_index test_forward_index.cpp)
-add_executable(test_unsorted_list test_unsorted_list.cpp)
-add_executable(test_autocomplete test_autocomplete.cpp)
-add_executable(test_locate_prefix test_locate_prefix.cpp)
-add_executable(test_blocked_inverted_index test_blocked_inverted_index.cpp)
\ No newline at end of file
diff --git a/test/test_autocomplete.cpp b/test/test_autocomplete.cpp
deleted file mode 100644
index d4fcefa..0000000
--- a/test/test_autocomplete.cpp
+++ /dev/null
@@ -1,110 +0,0 @@
-#include <iostream>
-
-#include "types.hpp"
-#include "statistics.hpp"
-
-using namespace autocomplete;
-
-int main(int argc, char** argv) {
-    int mandatory = 2;
-    if (argc < mandatory) {
-        std::cout << argv[0] << " <collection_basename> [-o output_filename]"
-                  << std::endl;
-        return 1;
-    }
-
-    char const* output_filename = nullptr;
-
-    for (int i = mandatory; i != argc; ++i) {
-        if (std::string(argv[i]) == "-o") {
-            ++i;
-            output_filename = argv[i];
-        }
-    }
-
-    parameters params;
-    params.collection_basename = argv[1];
-    params.load();
-
-    // typedef uncompressed_autocomplete_type index_type;
-    // typedef ef_autocomplete_type index_type;
-    typedef ef_autocomplete_type2 index_type;
-
-    {
-        index_type index(params);
-        if (output_filename) {
-            essentials::logger("saving data structure to disk...");
-            essentials::save(index, output_filename);
-            essentials::logger("DONE");
-        }
-    }
-
-    {
-        if (output_filename) {
-            index_type index;
-            essentials::logger("loading data structure from disk...");
-            essentials::load(index, output_filename);
-            essentials::logger("DONE");
-            index.print_stats();
-
-            {
-                essentials::logger("testing prefix_topk()...");
-                uint32_t k = 7;
-                std::vector<std::string> queries = {
-                    "a",        "10",          "african",
-                    "air",      "commercial",  "internet",
-                    "paris",    "somerset",    "the",
-                    "the new",  "the perfect", "the starting line",
-                    "yu gi oh", "for sale",    "dave mat",
-                    "florence", "florida be",  "for s",
-                    "for sa",   "for sal",     "for sale",
-                    "ford a",   "ford au",     "ford m",
-                    "ford mu",  "for",         "fo",
-                    "f",        "matt",        "fl",
-                    "florir",   "fly",         "the starting l",
-                    "floridaaa"};
-
-                for (auto& query : queries) {
-                    auto it = index.prefix_topk(query, k);
-                    std::cout << "top-" << it.size() << " completions for '"
-                              << query << "':\n";
-                    for (uint32_t i = 0; i != it.size(); ++i, ++it) {
-                        auto completion = *it;
-                        std::cout << "(" << completion.score << ", '";
-                        print(completion.string);
-                        std::cout << "')" << std::endl;
-                    }
-                }
-
-                essentials::logger("DONE");
-            }
-
-            {
-                essentials::logger("testing conjunctive_topk()...");
-                uint32_t k = 7;
-                std::vector<std::string> queries = {
-                    "dave mat", "florence", "florida be", "for s",
-                    "for sa",   "for sal",  "for sale",   "ford a",
-                    "ford au",  "ford m",   "ford mu",    "for",
-                    "fo",       "f",        "matt",       "fl",
-                    "flor",     "fly",      "the starting l"};
-
-                for (auto& query : queries) {
-                    auto it = index.conjunctive_topk(query, k);
-                    std::cout << "top-" << it.size() << " completions for '"
-                              << query << "':\n";
-                    for (uint32_t i = 0; i != it.size(); ++i, ++it) {
-                        auto completion = *it;
-                        std::cout << "(" << completion.score << ", '";
-                        print(completion.string);
-                        std::cout << "')" << std::endl;
-                    }
-                }
-
-                essentials::logger("DONE");
-            }
-        }
-    }
-
-    return 0;
-}
diff --git a/test/test_blocked_inverted_index.cpp b/test/test_blocked_inverted_index.cpp
deleted file mode 100644
index 94fc274..0000000
--- a/test/test_blocked_inverted_index.cpp
+++ /dev/null
@@ -1,40 +0,0 @@
-#include <iostream>
-
-#include "types.hpp"
-
-using namespace autocomplete;
-
-int main(int argc, char** argv) {
-    int mandatory = 2;
-    if (argc < mandatory) {
-        std::cout << argv[0] << " <collection_basename> [-o output_filename]"
-                  << std::endl;
-        return 1;
-    }
-
-    char const* output_filename = nullptr;
-
-    for (int i = mandatory; i != argc; ++i) {
-        if (std::string(argv[i]) == "-o") {
-            ++i;
-            output_filename = argv[i];
-        }
-    }
-
-    parameters params;
-    params.collection_basename = argv[1];
-    params.load();
-    const float c = 0.01;
-
-    {
-        // build, print and write
-        ef_blocked_inverted_index::builder builder(params, c);
-        ef_blocked_inverted_index bii;
-        builder.build(bii);
-        std::cout << "using " << bii.bytes() << " bytes" << std::endl;
-        std::cout << "num docs " << bii.num_docs() << std::endl;
-        std::cout << "num terms " << bii.num_terms() << std::endl;
-    }
-
-    return 0;
-}
diff --git a/test/test_cartesian_tree.cpp b/test/test_cartesian_tree.cpp
deleted file mode 100644
index 0c4fd38..0000000
--- a/test/test_cartesian_tree.cpp
+++ /dev/null
@@ -1,83 +0,0 @@
-#include <iostream>
-#include <fstream>
-
-#include "types.hpp"
-
-using namespace autocomplete;
-
-int main(int argc, char** argv) {
-    int mandatory = 2;
-    if (argc < mandatory) {
-        std::cout << argv[0] << " <collection_basename> [-o output_filename]"
-                  << std::endl;
-        return 1;
-    }
-
-    char const* output_filename = nullptr;
-
-    for (int i = mandatory; i != argc; ++i) {
-        if (std::string(argv[i]) == "-o") {
-            ++i;
-            output_filename = argv[i];
-        }
-    }
-
-    parameters params;
-    params.collection_basename = argv[1];
-    params.load();
-
-    {
-        // build and write
-
-        // std::vector<id_type> doc_ids = {23, 2, 4, 0, 88, 23, 2, 4, 55, 3,
-        //                                 7, 6, 90, 34, 2, 3, 1, 12, 23};
-
-        std::vector<id_type> doc_ids;
-        doc_ids.reserve(params.num_completions);
-        std::ifstream input(params.collection_basename + ".mapped",
-                            std::ios_base::in);
-        if (!input.good()) {
-            throw std::runtime_error("File not found");
-        }
-        completion_iterator it(params, input);
-        while (input) {
-            auto const& record = *it;
-            doc_ids.push_back(record.doc_id);
-            ++it;
-        }
-        input.close();
-
-        cartesian_tree rmq;
-        rmq.build(doc_ids, std::less<id_type>());
-        assert(rmq.size() == doc_ids.size());
-        std::cout << "using " << rmq.bytes() << " bytes" << std::endl;
-
-        if (output_filename) {
-            // essentials::print_size(rmq);
-            essentials::logger("saving data structure to disk...");
-            essentials::save(rmq, output_filename);
-            essentials::logger("DONE");
-        }
-    }
-
-    {
-        // load and print
-        if (output_filename) {
-            cartesian_tree rmq;
-            essentials::logger("loading data structure from disk...");
-            essentials::load(rmq, output_filename);
-            essentials::logger("DONE");
-
-            std::cout << "using " << rmq.bytes() << " bytes" << std::endl;
-
-            for (size_t i = 0; i != rmq.size(); ++i) {
-                for (size_t j = i; j != rmq.size(); ++j) {
-                    std::cout << "rmq[" << i << "," << j
-                              << "] = " << rmq.rmq(i, j) << std::endl;
-                }
-            }
-        }
-    }
-
-    return 0;
-}
diff --git a/test/test_completion_trie.cpp b/test/test_completion_trie.cpp
deleted file mode 100644
index 1aba989..0000000
--- a/test/test_completion_trie.cpp
+++ /dev/null
@@ -1,106 +0,0 @@
-#include <iostream>
-
-#include "types.hpp"
-#include "statistics.hpp"
-
-using namespace autocomplete;
-
-struct completion_comparator {
-    bool operator()(completion_type const& lhs,
-                    completion_type const& rhs) const {
-        size_t l = 0;  // |lcp(lhs,rhs)|
-        while (l < lhs.size() - 1 and l < rhs.size() - 1 and lhs[l] == rhs[l]) {
-            ++l;
-        }
-        return lhs[l] < rhs[l];
-    }
-};
-
-range locate_prefix(std::vector<completion_type> const& completions,
-                    completion_type const& c) {
-    completion_comparator comp;
-    auto b = std::lower_bound(completions.begin(), completions.end(), c, comp);
-    uint64_t begin = std::distance(completions.begin(), b);
-    auto e = std::upper_bound(completions.begin() + begin, completions.end(), c,
-                              comp);
-    uint64_t end = std::distance(completions.begin(), e);
-    return {begin, end};
-}
-
-int main(int argc, char** argv) {
-    int mandatory = 2;
-    if (argc < mandatory) {
-        std::cout << argv[0] << " <collection_basename> [-o output_filename]"
-                  << std::endl;
-        return 1;
-    }
-
-    char const* output_filename = nullptr;
-
-    for (int i = mandatory; i != argc; ++i) {
-        if (std::string(argv[i]) == "-o") {
-            ++i;
-            output_filename = argv[i];
-        }
-    }
-
-    parameters params;
-    params.collection_basename = argv[1];
-    params.load();
-
-    // typedef uint64_completion_trie completion_trie_type;
-    typedef ef_completion_trie completion_trie_type;
-
-    {
-        completion_trie_type::builder builder(params);
-        completion_trie_type ct;
-        builder.build(ct);
-        ct.print_stats();
-
-        if (output_filename) {
-            essentials::logger("saving data structure to disk...");
-            essentials::save(ct, output_filename);
-            essentials::logger("DONE");
-        }
-    }
-
-    {
-        if (output_filename) {
-            completion_trie_type ct;
-            essentials::logger("loading data structure from disk...");
-            essentials::load(ct, output_filename);
-            essentials::logger("DONE");
-            // essentials::print_size(ct);
-            std::cout << "using " << ct.bytes() << " bytes" << std::endl;
-
-            std::vector<completion_type> completions;
-            completions.reserve(params.num_completions);
-            std::ifstream input(params.collection_basename + ".mapped",
-                                std::ios_base::in);
-            if (!input.good()) {
-                throw std::runtime_error("File not found");
-            }
-
-            completion_iterator it(params, input);
-            while (input) {
-                auto& record = *it;
-                completions.push_back(std::move(record.completion));
-                ++it;
-            }
-            input.close();
-
-            // check all completions
-            essentials::logger("testing is_member()...");
-            for (auto const& c : completions) {
-                if (!ct.is_member(c)) {
-                    print_completion(c);
-                    std::cout << " not found!" << std::endl;
-                    return 1;
-                }
-            }
-            essentials::logger("DONE...");
-        }
-    }
-
-    return 0;
-}
diff --git a/test/test_fc_dictionary.cpp b/test/test_fc_dictionary.cpp
deleted file mode 100644
index 3f79d1e..0000000
--- a/test/test_fc_dictionary.cpp
+++ /dev/null
@@ -1,175 +0,0 @@
-#include <iostream>
-
-#include "types.hpp"
-
-using namespace autocomplete;
-
-id_type locate(std::vector<std::string> const& terms, std::string const& t) {
-    return std::distance(terms.begin(),
-                         std::lower_bound(terms.begin(), terms.end(), t)) +
-           1;
-}
-
-range locate_prefix(std::vector<std::string> const& terms,
-                    std::string const& p) {
-    auto comp_l = [](std::string const& l, std::string const& r) {
-        if (l.size() < r.size()) {
-            return strncmp(l.c_str(), r.c_str(), l.size()) <= 0;
-        }
-        return strcmp(l.c_str(), r.c_str()) < 0;
-    };
-
-    auto comp_r = [](std::string const& l, std::string const& r) {
-        if (l.size() < r.size()) {
-            return strncmp(l.c_str(), r.c_str(), l.size()) < 0;
-        }
-        return strcmp(l.c_str(), r.c_str()) < 0;
-    };
-
-    range r;
-    r.begin = std::distance(
-        terms.begin(), std::lower_bound(terms.begin(), terms.end(), p, comp_l));
-    r.end =
-        std::distance(terms.begin(),
-                      std::upper_bound(terms.begin(), terms.end(), p, comp_r)) -
-        1;
-
-    return r;
-}
-
-int main(int argc, char** argv) {
-    int mandatory = 2;
-    if (argc < mandatory) {
-        std::cout << argv[0] << " <collection_basename> [-o output_filename]"
-                  << std::endl;
-        return 1;
-    }
-
-    char const* output_filename = nullptr;
-
-    for (int i = mandatory; i != argc; ++i) {
-        if (std::string(argv[i]) == "-o") {
-            ++i;
-            output_filename = argv[i];
-        }
-    }
-
-    parameters params;
-    params.collection_basename = argv[1];
-    params.load();
-
-    {
-        // build, print and write
-        fc_dictionary_type::builder builder(params);
-        fc_dictionary_type dict;
-        builder.build(dict);
-        std::cout << "using " << dict.bytes() << " bytes" << std::endl;
-
-        if (output_filename) {
-            // essentials::print_size(dict);
-            essentials::logger("saving data structure to disk...");
-            essentials::save(dict, output_filename);
-            essentials::logger("DONE");
-        }
-    }
-
-    {
-        if (output_filename) {
-            fc_dictionary_type dict;
-            essentials::logger("loading data structure from disk...");
-            essentials::load(dict, output_filename);
-            essentials::logger("DONE");
-            // essentials::print_size(dict);
-            std::cout << "using " << dict.bytes() << " bytes" << std::endl;
-
-            // test locate() and extract for all strings
-            std::vector<std::string> terms;
-            terms.reserve(params.num_terms);
-            std::ifstream input((params.collection_basename + ".dict").c_str(),
-                                std::ios_base::in);
-            if (!input.good()) {
-                throw std::runtime_error("File not found");
-            }
-            std::string term;
-            term.reserve(256 + 1);
-            input >> term;
-            while (input) {
-                terms.push_back(std::move(term));
-                input >> term;
-            }
-            input.close();
-
-            std::cout << "terms.size() " << terms.size() << std::endl;
-
-            std::vector<uint8_t> decoded(2 *
-                                         constants::MAX_NUM_CHARS_PER_QUERY);
-
-            for (auto const& t : terms) {
-                id_type expected = locate(terms, t);
-                id_type got = dict.locate(string_to_byte_range(t));
-
-                std::cout << "locating term '" << t << "'" << std::endl;
-                if (got != expected) {
-                    std::cout << "Error: expected id " << expected << ","
-                              << " but got id " << got << std::endl;
-                    return 1;
-                }
-
-                std::cout << "extracting term '" << t << "'" << std::endl;
-                uint8_t string_len = dict.extract(got, decoded.data());
-
-                if (string_len != t.size()) {
-                    std::cout << "Error: expected size " << t.size() << ","
-                              << " but got size " << string_len << std::endl;
-                    return 1;
-                }
-
-                auto s = reinterpret_cast<char const*>(decoded.data());
-                for (uint8_t i = 0; i != string_len; ++i) {
-                    if (t[i] != s[i]) {
-                        std::cout << "Error: expected char " << t[i]
-                                  << " but got " << s[i] << std::endl;
-                        return 1;
-                    }
-                }
-
-                std::cout << "lexicographic id of '" << t << "' is " << got
-                          << std::endl;
-            }
-
-            // test locate_prefix() for all strings
-            std::string prefix;
-            prefix.reserve(256 + 1);
-            for (auto const& t : terms) {
-                uint32_t n = t.size();
-                for (uint32_t prefix_len = 1; prefix_len <= n; ++prefix_len) {
-                    prefix.clear();
-                    for (uint32_t i = 0; i != prefix_len; ++i) {
-                        prefix.push_back(t[i]);
-                    }
-
-                    std::cout << "locating prefix '" << prefix << "'"
-                              << std::endl;
-                    range expected = locate_prefix(terms, prefix);
-                    range got =
-                        dict.locate_prefix(string_to_byte_range(prefix));
-
-                    if ((got.begin != expected.begin) or
-                        (got.end != expected.end)) {
-                        std::cout << "Error for prefix '" << prefix
-                                  << "' : expected [" << expected.begin << ","
-                                  << expected.end << "] but got [" << got.begin
-                                  << "," << got.end << "]" << std::endl;
-                        return 1;
-                    }
-
-                    std::cout << "prefix range of '" << prefix << "' is ["
-                              << got.begin << "," << got.end << "]"
-                              << std::endl;
-                }
-            }
-        }
-    }
-
-    return 0;
-}
diff --git a/test/test_forward_index.cpp b/test/test_forward_index.cpp
deleted file mode 100644
index 576215d..0000000
--- a/test/test_forward_index.cpp
+++ /dev/null
@@ -1,58 +0,0 @@
-#include <iostream>
-
-#include "types.hpp"
-
-using namespace autocomplete;
-
-int main(int argc, char** argv) {
-    int mandatory = 2;
-    if (argc < mandatory) {
-        std::cout << argv[0] << " <collection_basename> [-o output_filename]"
-                  << std::endl;
-        return 1;
-    }
-
-    char const* output_filename = nullptr;
-
-    for (int i = mandatory; i != argc; ++i) {
-        if (std::string(argv[i]) == "-o") {
-            ++i;
-            output_filename = argv[i];
-        }
-    }
-
-    parameters params;
-    params.collection_basename = argv[1];
-    params.load();
-
-    typedef compact_forward_index forward_index_type;
-
-    {
-        forward_index_type::builder builder(params);
-        forward_index_type index;
-        builder.build(index);
-        std::cout << "using " << index.bytes() << " bytes" << std::endl;
-        std::cout << "num docs " << index.num_docs() << std::endl;
-        std::cout << "num terms " << index.num_terms() << std::endl;
-
-        if (output_filename) {
-            essentials::logger("saving data structure to disk...");
-            essentials::save(index, output_filename);
-            essentials::logger("DONE");
-        }
-    }
-
-    {
-        if (output_filename) {
-            forward_index_type index;
-            essentials::logger("loading data structure from disk...");
-            essentials::load(index, output_filename);
-            essentials::logger("DONE");
-            std::cout << "using " << index.bytes() << " bytes" << std::endl;
-            std::cout << "num docs " << index.num_docs() << std::endl;
-            std::cout << "num terms " << index.num_terms() << std::endl;
-        }
-    }
-
-    return 0;
-}
diff --git a/test/test_integer_fc_dictionary.cpp b/test/test_integer_fc_dictionary.cpp
deleted file mode 100644
index 4f78052..0000000
--- a/test/test_integer_fc_dictionary.cpp
+++ /dev/null
@@ -1,155 +0,0 @@
-#include <iostream>
-
-#include "types.hpp"
-
-using namespace autocomplete;
-
-int main(int argc, char** argv) {
-    int mandatory = 2;
-    if (argc < mandatory) {
-        std::cout << argv[0] << " <collection_basename> [-o output_filename]"
-                  << std::endl;
-        return 1;
-    }
-
-    char const* output_filename = nullptr;
-
-    for (int i = mandatory; i != argc; ++i) {
-        if (std::string(argv[i]) == "-o") {
-            ++i;
-            output_filename = argv[i];
-        }
-    }
-
-    parameters params;
-    params.collection_basename = argv[1];
-    params.load();
-
-    {
-        // build, print and write
-        integer_fc_dictionary_type::builder builder(params);
-        integer_fc_dictionary_type dict;
-        builder.build(dict);
-        std::cout << "using " << dict.bytes() << " bytes" << std::endl;
-
-        if (output_filename) {
-            // essentials::print_size(dict);
-            essentials::logger("saving data structure to disk...");
-            essentials::save(dict, output_filename);
-            essentials::logger("DONE");
-        }
-    }
-
-    {
-        if (output_filename) {
-            integer_fc_dictionary_type dict;
-            essentials::logger("loading data structure from disk...");
-            essentials::load(dict, output_filename);
-            essentials::logger("DONE");
-            std::cout << "using " << dict.bytes() << " bytes" << std::endl;
-
-            {
-                essentials::logger("testing extract() and locate()...");
-                std::ifstream input(
-                    (params.collection_basename + ".mapped").c_str(),
-                    std::ios_base::in);
-                completion_iterator it(params, input);
-
-                completion_type decoded(2 * constants::MAX_NUM_TERMS_PER_QUERY);
-                for (id_type id = 0; id != params.num_completions; ++id, ++it) {
-                    auto const& expected = (*it).completion;
-                    assert(expected.size() > 0);
-                    uint8_t size = dict.extract(id, decoded);
-                    if (expected.size() - 1 != size) {
-                        std::cout << "Error in decoding the " << id
-                                  << "-th string: expected size "
-                                  << expected.size() - 1 << ","
-                                  << " but got size " << int(size) << std::endl;
-                        return 1;
-                    }
-
-                    for (uint8_t i = 0; i != size; ++i) {
-                        if (decoded[i] != expected[i]) {
-                            std::cout
-                                << "Error in decoding the " << id
-                                << "-th string: expected " << expected[i] << ","
-                                << " but got " << decoded[i] << " at position "
-                                << int(i) << std::endl;
-                            return 1;
-                        }
-                    }
-
-                    id_type got_id =
-                        dict.locate({decoded.data(), decoded.data() + size});
-                    if (got_id != id) {
-                        std::cout << "Error in locating the " << id
-                                  << "-th string: expected id " << id << ","
-                                  << " but got id " << got_id << std::endl;
-                        return 1;
-                    }
-                }
-
-                input.close();
-                essentials::logger("it's all good");
-            }
-
-            // {
-            //     uint64_completion_trie::builder builder(params);
-            //     uint64_completion_trie ct;
-            //     builder.build(ct);
-            //     std::cout << "using " << ct.bytes() << " bytes" << std::endl;
-
-            //     essentials::logger("testing locate_prefix()...");
-
-            //     std::ifstream input(
-            //         (params.collection_basename + ".mapped").c_str(),
-            //         std::ios_base::in);
-            //     completion_iterator it(params, input);
-
-            //     uint32_t num_checks =
-            //         std::min<uint32_t>(params.num_completions, 30000);
-
-            //     completion_type prefix;
-            //     for (uint32_t i = 0; i != num_checks; ++i, ++it) {
-            //         auto const& expected = (*it).completion;
-            //         assert(expected.size() > 0);
-
-            //         for (uint32_t prefix_len = 1;
-            //              prefix_len <= expected.size() - 1; ++prefix_len) {
-            //             prefix.clear();
-            //             for (uint32_t i = 0; i != prefix_len; ++i) {
-            //                 prefix.push_back(expected[i]);
-            //             }
-
-            //             range expected = ct.locate_prefix(prefix);
-            //             range got = dict.locate_prefix(
-            //                 completion_to_uint32_range(prefix));
-
-            //             if ((got.begin != expected.begin) or
-            //                 (got.end != expected.end - 1)) {
-            //                 std::cout << "Error for prefix ";
-            //                 print_completion(prefix);
-            //                 std::cout << ": expected [" << expected.begin << ","
-            //                           << expected.end - 1 << "] but got ["
-            //                           << got.begin << "," << got.end << "]"
-            //                           << std::endl;
-            //                 return 1;
-            //             }
-
-            //             // std::cout << "prefix range of ";
-            //             // print_completion(prefix);
-            //             // std::cout << " is [" << got.begin << "," << got.end
-            //             //           << "]" << std::endl;
-            //         }
-            //     }
-
-            //     input.close();
-            //     essentials::logger("it's all good");
-            // }
-        }
-    }
-
-    return 0;
-}
diff --git a/test/test_inverted_index.cpp b/test/test_inverted_index.cpp
deleted file mode 100644
index 81f913e..0000000
--- a/test/test_inverted_index.cpp
+++ /dev/null
@@ -1,127 +0,0 @@
-#include <iostream>
-
-#include "types.hpp"
-
-using namespace autocomplete;
-
-int main(int argc, char** argv) {
-    int mandatory = 2;
-    if (argc < mandatory) {
-        std::cout << argv[0] << " <collection_basename> [-o output_filename]"
-                  << std::endl;
-        return 1;
-    }
-
-    char const* output_filename = nullptr;
-
-    for (int i = mandatory; i != argc; ++i) {
-        if (std::string(argv[i]) == "-o") {
-            ++i;
-            output_filename = argv[i];
-        }
-    }
-
-    parameters params;
-    params.collection_basename = argv[1];
-    params.load();
-
-    typedef ef_inverted_index inverted_index_type;
-
-    {
-        // build, print and write
-        inverted_index_type::builder builder(params);
-        inverted_index_type index;
-        builder.build(index);
-        std::cout << "using " << index.bytes() << " bytes" << std::endl;
-        std::cout << "num docs " << index.num_docs() << std::endl;
-        std::cout << "num terms " << index.num_terms() << std::endl;
-
-        if (output_filename) {
-            essentials::logger("saving data structure to disk...");
-            essentials::save(index, output_filename);
-            essentials::logger("DONE");
-        }
-    }
-
-    {
-        if (output_filename) {
-            inverted_index_type index;
-            essentials::logger("loading data structure from disk...");
-            essentials::load(index, output_filename);
-            essentials::logger("DONE");
-            std::cout << "using " << index.bytes() << " bytes" << std::endl;
-            std::cout << "num docs " << index.num_docs() << std::endl;
-            std::cout << "num terms " << index.num_terms() << std::endl;
-
-            std::vector<id_type> intersection(index.num_docs());  // at most
-            std::vector<id_type> term_ids;
-            term_ids.reserve(2);
-
-            // id_type i = 293;
-            // id_type j = 294;
-            // id_type i = 899;
-            // id_type j = 822;
-            id_type i = 2401599 - 1;
-            id_type j = 1752198 - 1;
-            term_ids.push_back(i + 1);
-            term_ids.push_back(j + 1);
-            // uint64_t size = index.intersect(term_ids, intersection);
-
-            {
-                std::cout << "intersection between " << i << " and " << j
-                          << " is: ";
-                uint32_t i = 0;
-                auto intersec_it = index.intersection_iterator(term_ids);
-                while (intersec_it.has_next()) {
-                    id_type doc_id = *intersec_it;
-                    std::cout << doc_id << " ";
-                    ++i;
-                    ++intersec_it;
-                }
-                std::cout << std::endl;
-            }
-
-            std::vector<id_type> a;
-            {
-                auto it = index.iterator(i);
-                a.resize(it.size());
-                for (uint32_t i = 0; i != a.size(); ++i) {
-                    a[i] = it.access(i);
-                }
-            }
-
-            std::vector<id_type> b;
-            {
-                auto it = index.iterator(j);
-                b.resize(it.size());
-                for (uint32_t i = 0; i != b.size(); ++i) {
-                    b[i] = it.access(i);
-                }
-            }
-
-            auto it = std::set_intersection(a.begin(), a.end(), b.begin(),
-                                            b.end(), intersection.begin());
-            intersection.resize(it - intersection.begin());
-            std::cout << "intersection between " << i << " and " << j
-                      << " is: ";
-            for (auto x : intersection) {
-                std::cout << x << " ";
-            }
-            std::cout << std::endl;
-
-            // for (uint32_t i = 1; i != index.num_terms() + 1; ++i) {
-            //     for (uint32_t j = i; j != index.num_terms() + 1; ++j) {
-            //         term_ids.clear();
-            //         term_ids.push_back(i);
-            //         term_ids.push_back(j);
-            //         uint64_t size = index.intersect(term_ids, intersection);
-            //         std::cout << "size of intersection between " << i << " and "
-            //                   << j << " is " << size << std::endl;
-            //     }
-            // }
-        }
-    }
-
-    return 0;
-}
diff --git a/test/test_locate_prefix.cpp b/test/test_locate_prefix.cpp
deleted file mode 100644
index fd3dcb4..0000000
--- a/test/test_locate_prefix.cpp
+++ /dev/null
@@ -1,141 +0,0 @@
-#include <iostream>
-
-#include "types.hpp"
-#include "../benchmark/benchmark_common.hpp"
-
-using namespace autocomplete;
-
-range locate_prefix(std::vector<std::string> const& strings,
-                    std::string const& p) {
-    auto comp_l = [](std::string const& l, std::string const& r) {
-        if (l.size() < r.size()) {
-            return strncmp(l.c_str(), r.c_str(), l.size()) <= 0;
-        }
-        return strcmp(l.c_str(), r.c_str()) < 0;
-    };
-
-    auto comp_r = [](std::string const& l, std::string const& r) {
-        if (l.size() < r.size()) {
-            return strncmp(l.c_str(), r.c_str(), l.size()) < 0;
-        }
-        return strcmp(l.c_str(), r.c_str()) < 0;
-    };
-
-    range r;
-    r.begin = std::distance(
-        strings.begin(),
-        std::lower_bound(strings.begin(), strings.end(), p, comp_l));
-    r.end = std::distance(
-        strings.begin(),
-        std::upper_bound(strings.begin(), strings.end(), p, comp_r));
-
-    return r;
-}
-
-template <typename Dictionary, typename Index>
-int test_locate_prefix(Dictionary const& dict, Index const& index,
-                       std::vector<std::string> const& queries,
-                       std::vector<std::string> const& strings) {
-    for (auto const& query : queries) {
-        std::string query_copy = query;
-        range expected = locate_prefix(strings, query);
-
-        // std::cout << "query: '" << query << "'" << std::endl;
-        completion_type prefix;
-        byte_range suffix;
-        parse(dict, query_copy, prefix, suffix);
-
-        // print_completion(prefix);
-        // std::cout << std::endl;
-        // print(suffix);
-        // std::cout << std::endl;
-
-        range suffix_lex_range = dict.locate_prefix(suffix);
-        suffix_lex_range.begin += 1;
-        suffix_lex_range.end += 1;
-        range got = index.locate_prefix(prefix, suffix_lex_range);
-
-        if ((got.begin != expected.begin) or (got.end != expected.end)) {
-            std::cout << "Error for query '" << query << "': ";
-            std::cout << "expected [" << expected.begin << "," << expected.end
-                      << ") but got [" << got.begin << "," << got.end << ")"
-                      << std::endl;
-            return 1;
-        }
-    }
-
-    return 0;
-}
-
-int main(int argc, char** argv) {
-    int mandatory = 2;
-    if (argc < mandatory) {
-        std::cout << argv[0] << " <collection_basename> <max_num_queries>"
-                  << std::endl;
-        return 1;
-    }
-
-    parameters params;
-    params.collection_basename = argv[1];
-    params.load();
-
-    fc_dictionary_type dict;
-    {
-        fc_dictionary_type::builder builder(params);
-        builder.build(dict);
-    }
-
-    std::vector<std::string> strings;
-
-    {
-        essentials::logger("loading all strings...");
-        std::string line;
-        strings.reserve(params.num_completions);
-        std::ifstream input((params.collection_basename).c_str(),
-                            std::ios_base::in);
-        for (uint32_t i = 0; i != params.num_completions; ++i) {
-            if (!std::getline(input, line)) break;
-            auto s = line.substr(line.find(' ') + 1, line.size());
-            strings.push_back(s);
-        }
-        input.close();
-        essentials::logger("loaded " + std::to_string(strings.size()) +
-                           " strings");
-    }
-
-    uint32_t max_num_queries = std::atoi(argv[2]);
-    std::vector<std::string> queries;
-    essentials::logger("loading queries...");
-    uint32_t num_queries =
-        load_queries(queries, max_num_queries, true, std::cin);
-    essentials::logger("loaded " + std::to_string(num_queries) + " queries");
-
-    {
-        // typedef uint64_completion_trie completion_trie_type;
-        typedef ef_completion_trie completion_trie_type;
-
-        completion_trie_type index;
-        {
-            completion_trie_type::builder builder(params);
-            builder.build(index);
-        }
-        essentials::logger("testing locate_prefix() for completion_trie...");
-        int ret = test_locate_prefix(dict, index, queries, strings);
-        if (ret) return 1;
-        essentials::logger("it's all good");
-    }
-
-    {
-        integer_fc_dictionary_type index;
-        {
-            integer_fc_dictionary_type::builder builder(params);
-            builder.build(index);
-        }
-        essentials::logger(
-            "testing locate_prefix() for integer_fc_dictionary...");
-        int ret = test_locate_prefix(dict, index, queries, strings);
-        if (ret) return 1;
-        essentials::logger("it's all good");
-    }
-
-    return 0;
-}
diff --git a/test/test_unsorted_list.cpp b/test/test_unsorted_list.cpp
deleted file mode 100644
index 9b9b000..0000000
--- a/test/test_unsorted_list.cpp
+++ /dev/null
@@ -1,148 +0,0 @@
-#include <iostream>
-#include <algorithm>
-
-#include "types.hpp"
-
-using namespace autocomplete;
-
-static const uint32_t max_k = 15;
-static const uint32_t k = 10;
-static_assert(k <= max_k, "k must be less than max allowed");
-static const uint32_t num_queries = 10000;
-
-std::vector<id_type> naive_topk(std::vector<id_type> const& input, range r,
-                                uint32_t k) {
-    uint32_t range_len = r.end - r.begin;
-    std::vector<id_type> topk(range_len);
-    for (uint32_t i = 0; i != range_len; ++i) {
-        topk[i] = input[r.begin + i];
-    }
-    std::sort(topk.begin(), topk.begin() + range_len);
-    topk.resize(std::min(k, range_len));
-    return topk;
-}
-
-std::vector<range> gen_random_queries(uint32_t num_queries,
-                                      uint32_t max_range_len) {
-    std::vector<range> queries;
-    queries.reserve(num_queries);
-    essentials::uniform_int_rng<uint32_t> random(0, max_range_len);
-    for (uint32_t i = 0; i != num_queries; ++i) {
-        uint32_t x = random.gen();
-        uint32_t y = random.gen();
-        range r;
-        if (y > x) {
-            r = {x, y};
-        } else {
-            r = {y, x};
-        }
-        queries.push_back(r);
-    }
-    return queries;
-}
-
-int main(int argc, char** argv) {
-    int mandatory = 2;
-    if (argc < mandatory) {
-        std::cout << argv[0] << " <collection_basename> [-o output_filename]"
-                  << std::endl;
-        return 1;
-    }
-
-    char const* output_filename = nullptr;
-
-    for (int i = mandatory; i != argc; ++i) {
-        if (std::string(argv[i]) == "-o") {
-            ++i;
-            output_filename = argv[i];
-        }
-    }
-
-    parameters params;
-    params.collection_basename = argv[1];
-    params.load();
-
-    std::vector<id_type> doc_ids;
-
-    {
-        // build and write
-        doc_ids.reserve(params.num_completions);
-        std::ifstream input(params.collection_basename + ".mapped",
-                            std::ios_base::in);
-        if (!input.good()) {
-            throw std::runtime_error("File not found");
-        }
-        completion_iterator it(params, input);
-        while (input) {
-            auto const& record = *it;
-            doc_ids.push_back(record.doc_id);
-            ++it;
-        }
-        input.close();
-
-        {
-            // must have all ids from 0 to doc_ids.size() - 1
-            std::vector<id_type> tmp = doc_ids;
-            std::sort(tmp.begin(), tmp.end());
-            for (id_type id = 0; id != doc_ids.size(); ++id) {
-                if (tmp[id] != id) {
-                    std::cout << "Error: id " << id << " not found"
-                              << std::endl;
-                    return 1;
-                }
-            }
-        }
-
-        succinct_rmq list;
-        list.build(doc_ids);
-        assert(list.size() == doc_ids.size());
-        std::cout << "using " << list.bytes() << " bytes" << std::endl;
-
-        if (output_filename) {
-            // essentials::print_size(list);
-            essentials::logger("saving data structure to disk...");
-            essentials::save(list, output_filename);
-            essentials::logger("DONE");
-        }
-    }
-
-    {
-        if (output_filename) {
-            succinct_rmq list;
-            essentials::logger("loading data structure from disk...");
-            essentials::load(list, output_filename);
-            essentials::logger("DONE");
-
-            std::cout << "using " << list.bytes() << " bytes" << std::endl;
-
-            std::vector<id_type> topk(max_k);
-            auto queries = gen_random_queries(num_queries, doc_ids.size());
-            std::cout << "testing top-" << k << " " << num_queries
-                      << " random queries..." << std::endl;
-
-            for (auto q : queries) {
-                auto expected = naive_topk(doc_ids, q, k);
-                uint32_t num_elements = list.topk(q, k, topk);
-
-                if (expected.size() != num_elements) {
-                    std::cout << "Error: expected " << expected.size()
-                              << " topk elements but got " << num_elements
-                              << std::endl;
-                    return 1;
-                }
-
-                for (uint32_t i = 0; i != num_elements; ++i) {
-                    if (topk[i] != expected[i]) {
-                        std::cout << "Error: expected " << expected[i]
-                                  << " but got " << topk[i] << std::endl;
-                        return 1;
-                    }
-                }
-            }
-
-            std::cout << "it's all good" << std::endl;
-        }
-    }
-
-    return 0;
-}
diff --git a/test_data/partition_queries_by_length.py b/test_data/partition_queries_by_length.py
deleted file mode 100644
index f9cb561..0000000
--- a/test_data/partition_queries_by_length.py
+++ /dev/null
@@ -1,27 +0,0 @@
-import sys
-import numpy as np
-
-input_filename = sys.argv[1]
-
-num_shards = 7
-files = [open(input_filename + ".length=" + str(i), "w") for i in range(1,num_shards + 1)]
-all_others = open(input_filename + ".length=" + str(num_shards + 1) + "+", "w")
-
-lines = 0
-with open(input_filename, 'r') as f:
-    for line in f:
-        x = line.rstrip('\n').split()
-        l = len(x) - 1
-
-        if l > num_shards:
-            all_others.write(line)
-        else:
-            files[l - 1].write(line)
-
-        lines += 1
-        if lines % 1000000 == 0:
-            print("processed " + str(lines) + " lines")
-
-for f in files:
-    f.close()
-all_others.close()