diff --git a/examples/integer_range_search.rs b/examples/integer_range_search.rs index 10f2015c5d..5ab6a7fa21 100644 --- a/examples/integer_range_search.rs +++ b/examples/integer_range_search.rs @@ -1,3 +1,5 @@ +use std::ops::Bound; + // # Searching a range on an indexed int field. // // Below is an example of creating an indexed integer field in your schema @@ -5,7 +7,7 @@ use tantivy::collector::Count; use tantivy::query::RangeQuery; use tantivy::schema::{Schema, INDEXED}; -use tantivy::{doc, Index, IndexWriter, Result}; +use tantivy::{doc, Index, IndexWriter, Result, Term}; fn main() -> Result<()> { // For the sake of simplicity, this schema will only have 1 field @@ -27,7 +29,10 @@ fn main() -> Result<()> { reader.reload()?; let searcher = reader.searcher(); // The end is excluded i.e. here we are searching up to 1969 - let docs_in_the_sixties = RangeQuery::new_u64("year".to_string(), 1960..1970); + let docs_in_the_sixties = RangeQuery::new( + Bound::Included(Term::from_field_u64(year_field, 1960)), + Bound::Excluded(Term::from_field_u64(year_field, 1970)), + ); // Uses a Count collector to sum the total number of docs in the range let num_60s_books = searcher.search(&docs_in_the_sixties, &Count)?; assert_eq!(num_60s_books, 10); diff --git a/src/indexer/mod.rs b/src/indexer/mod.rs index 1a83ccca17..c583a4b5c5 100644 --- a/src/indexer/mod.rs +++ b/src/indexer/mod.rs @@ -303,7 +303,7 @@ mod tests_mmap { Type::Str, ), (format!("{field_name_out_internal}a"), Type::Str), - (format!("{field_name_out_internal}"), Type::Str), + (field_name_out_internal.to_string(), Type::Str), (format!("num{field_name_out_internal}"), Type::I64), ]; expected_fields.sort(); diff --git a/src/query/all_query.rs b/src/query/all_query.rs index 149041b043..8da90df23a 100644 --- a/src/query/all_query.rs +++ b/src/query/all_query.rs @@ -22,10 +22,7 @@ pub struct AllWeight; impl Weight for AllWeight { fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result> { - let all_scorer = 
AllScorer { - doc: 0u32, - max_doc: reader.max_doc(), - }; + let all_scorer = AllScorer::new(reader.max_doc()); Ok(Box::new(BoostScorer::new(all_scorer, boost))) } @@ -43,6 +40,13 @@ pub struct AllScorer { max_doc: DocId, } +impl AllScorer { + /// Creates a new AllScorer with `max_doc` docs. + pub fn new(max_doc: DocId) -> AllScorer { + AllScorer { doc: 0u32, max_doc } + } +} + impl DocSet for AllScorer { #[inline(always)] fn advance(&mut self) -> DocId { diff --git a/src/query/disjunction.rs b/src/query/disjunction.rs index b2d9d67ffa..81723af9a7 100644 --- a/src/query/disjunction.rs +++ b/src/query/disjunction.rs @@ -192,7 +192,7 @@ mod tests { .cloned() .map(VecDocSet::from) .map(|d| ConstScorer::new(d, 1.0)), - DoNothingCombiner::default(), + DoNothingCombiner, min_match, ) }; diff --git a/src/query/exist_query.rs b/src/query/exist_query.rs index f028ebaa9f..7daacdfef6 100644 --- a/src/query/exist_query.rs +++ b/src/query/exist_query.rs @@ -149,7 +149,7 @@ mod tests { use crate::query::exist_query::ExistsQuery; use crate::query::{BooleanQuery, RangeQuery}; use crate::schema::{Facet, FacetOptions, Schema, FAST, INDEXED, STRING, TEXT}; - use crate::{Index, Searcher}; + use crate::{Index, Searcher, Term}; #[test] fn test_exists_query_simple() -> crate::Result<()> { @@ -188,9 +188,8 @@ mod tests { // exercise seek let query = BooleanQuery::intersection(vec![ - Box::new(RangeQuery::new_u64_bounds( - "all".to_string(), - Bound::Included(50), + Box::new(RangeQuery::new( + Bound::Included(Term::from_field_u64(all_field, 50)), Bound::Unbounded, )), Box::new(ExistsQuery::new_exists_query("even".to_string())), @@ -198,10 +197,9 @@ mod tests { assert_eq!(searcher.search(&query, &Count)?, 25); let query = BooleanQuery::intersection(vec![ - Box::new(RangeQuery::new_u64_bounds( - "all".to_string(), - Bound::Included(0), - Bound::Excluded(50), + Box::new(RangeQuery::new( + Bound::Included(Term::from_field_u64(all_field, 0)), + Bound::Included(Term::from_field_u64(all_field, 
50)), )), Box::new(ExistsQuery::new_exists_query("odd".to_string())), ]); diff --git a/src/query/mod.rs b/src/query/mod.rs index 03c6c01d0d..d57d3eeab1 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -54,7 +54,7 @@ pub use self::phrase_prefix_query::PhrasePrefixQuery; pub use self::phrase_query::PhraseQuery; pub use self::query::{EnableScoring, Query, QueryClone}; pub use self::query_parser::{QueryParser, QueryParserError}; -pub use self::range_query::{FastFieldRangeWeight, IPFastFieldRangeWeight, RangeQuery}; +pub use self::range_query::{FastFieldRangeWeight, RangeQuery}; pub use self::regex_query::RegexQuery; pub use self::reqopt_scorer::RequiredOptionalScorer; pub use self::score_combiner::{ diff --git a/src/query/phrase_prefix_query/phrase_prefix_query.rs b/src/query/phrase_prefix_query/phrase_prefix_query.rs index 8cbbe637e5..d6efe388d5 100644 --- a/src/query/phrase_prefix_query/phrase_prefix_query.rs +++ b/src/query/phrase_prefix_query/phrase_prefix_query.rs @@ -145,15 +145,7 @@ impl Query for PhrasePrefixQuery { Bound::Unbounded }; - let mut range_query = RangeQuery::new_term_bounds( - enable_scoring - .schema() - .get_field_name(self.field) - .to_owned(), - self.prefix.1.typ(), - &Bound::Included(self.prefix.1.clone()), - &end_term, - ); + let mut range_query = RangeQuery::new(Bound::Included(self.prefix.1.clone()), end_term); range_query.limit(self.max_expansions as u64); range_query.weight(enable_scoring) } diff --git a/src/query/query_parser/logical_ast.rs b/src/query/query_parser/logical_ast.rs index ce57b710b7..a9400881b5 100644 --- a/src/query/query_parser/logical_ast.rs +++ b/src/query/query_parser/logical_ast.rs @@ -2,7 +2,7 @@ use std::fmt; use std::ops::Bound; use crate::query::Occur; -use crate::schema::{Term, Type}; +use crate::schema::Term; use crate::Score; #[derive(Clone)] @@ -14,8 +14,6 @@ pub enum LogicalLiteral { prefix: bool, }, Range { - field: String, - value_type: Type, lower: Bound, upper: Bound, }, diff --git 
a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index 8ec50f8d4d..81115d2878 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -790,8 +790,6 @@ impl QueryParser { let (field, json_path) = try_tuple!(self .split_full_path(&full_path) .ok_or_else(|| QueryParserError::FieldDoesNotExist(full_path.clone()))); - let field_entry = self.schema.get_field_entry(field); - let value_type = field_entry.field_type().value_type(); let mut errors = Vec::new(); let lower = match self.resolve_bound(field, json_path, &lower) { Ok(bound) => bound, @@ -812,12 +810,8 @@ impl QueryParser { // we failed to parse something. Either way, there is no point emiting it return (None, errors); } - let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Range { - field: self.schema.get_field_name(field).to_string(), - value_type, - lower, - upper, - })); + let logical_ast = + LogicalAst::Leaf(Box::new(LogicalLiteral::Range { lower, upper })); (Some(logical_ast), errors) } UserInputLeaf::Set { @@ -884,14 +878,7 @@ fn convert_literal_to_query( Box::new(PhraseQuery::new_with_offset_and_slop(terms, slop)) } } - LogicalLiteral::Range { - field, - value_type, - lower, - upper, - } => Box::new(RangeQuery::new_term_bounds( - field, value_type, &lower, &upper, - )), + LogicalLiteral::Range { lower, upper } => Box::new(RangeQuery::new(lower, upper)), LogicalLiteral::Set { elements, .. 
} => Box::new(TermSetQuery::new(elements)), LogicalLiteral::All => Box::new(AllQuery), } @@ -1136,8 +1123,8 @@ mod test { let query = make_query_parser().parse_query("title:[A TO B]").unwrap(); assert_eq!( format!("{query:?}"), - "RangeQuery { field: \"title\", value_type: Str, lower_bound: Included([97]), \ - upper_bound: Included([98]), limit: None }" + "RangeQuery { lower_bound: Included(Term(field=0, type=Str, \"a\")), upper_bound: \ + Included(Term(field=0, type=Str, \"b\")), limit: None }" ); } diff --git a/src/query/range_query/fast_field_range_query.rs b/src/query/range_query/fast_field_range_doc_set.rs similarity index 96% rename from src/query/range_query/fast_field_range_query.rs rename to src/query/range_query/fast_field_range_doc_set.rs index def8d9c3bf..f30f8620c4 100644 --- a/src/query/range_query/fast_field_range_query.rs +++ b/src/query/range_query/fast_field_range_doc_set.rs @@ -180,10 +180,12 @@ impl DocSet for RangeDocSe #[cfg(test)] mod tests { + use std::ops::Bound; + use crate::collector::Count; use crate::directory::RamDirectory; use crate::query::RangeQuery; - use crate::{schema, IndexBuilder, TantivyDocument}; + use crate::{schema, IndexBuilder, TantivyDocument, Term}; #[test] fn range_query_fast_optional_field_minimum() { @@ -218,10 +220,9 @@ mod tests { let reader = index.reader().unwrap(); let searcher = reader.searcher(); - let query = RangeQuery::new_u64_bounds( - "score".to_string(), - std::ops::Bound::Included(70), - std::ops::Bound::Unbounded, + let query = RangeQuery::new( + Bound::Included(Term::from_field_u64(score_field, 70)), + Bound::Unbounded, ); let count = searcher.search(&query, &Count).unwrap(); diff --git a/src/query/range_query/mod.rs b/src/query/range_query/mod.rs index 75b47626fa..8ed26c95ab 100644 --- a/src/query/range_query/mod.rs +++ b/src/query/range_query/mod.rs @@ -2,13 +2,11 @@ use std::ops::Bound; use crate::schema::Type; -mod fast_field_range_query; +mod fast_field_range_doc_set; mod range_query; -mod 
range_query_ip_fastfield; mod range_query_u64_fastfield; pub use self::range_query::RangeQuery; -pub use self::range_query_ip_fastfield::IPFastFieldRangeWeight; pub use self::range_query_u64_fastfield::FastFieldRangeWeight; // TODO is this correct? diff --git a/src/query/range_query/range_query.rs b/src/query/range_query/range_query.rs index ac2327c7a5..4b27714c3d 100644 --- a/src/query/range_query/range_query.rs +++ b/src/query/range_query/range_query.rs @@ -1,21 +1,17 @@ use std::io; -use std::net::Ipv6Addr; -use std::ops::{Bound, Range}; +use std::ops::Bound; -use columnar::MonotonicallyMappableToU128; -use common::{BinarySerializable, BitSet}; +use common::BitSet; use super::map_bound; use super::range_query_u64_fastfield::FastFieldRangeWeight; -use crate::error::TantivyError; use crate::index::SegmentReader; use crate::query::explanation::does_not_match; -use crate::query::range_query::range_query_ip_fastfield::IPFastFieldRangeWeight; -use crate::query::range_query::{is_type_valid_for_fastfield_range_query, map_bound_res}; +use crate::query::range_query::is_type_valid_for_fastfield_range_query; use crate::query::{BitSetDocSet, ConstScorer, EnableScoring, Explanation, Query, Scorer, Weight}; use crate::schema::{Field, IndexRecordOption, Term, Type}; use crate::termdict::{TermDictionary, TermStreamer}; -use crate::{DateTime, DocId, Score}; +use crate::{DocId, Score}; /// `RangeQuery` matches all documents that have at least one term within a defined range. 
/// @@ -40,8 +36,10 @@ use crate::{DateTime, DocId, Score}; /// ```rust /// use tantivy::collector::Count; /// use tantivy::query::RangeQuery; +/// use tantivy::Term; /// use tantivy::schema::{Schema, INDEXED}; /// use tantivy::{doc, Index, IndexWriter}; +/// use std::ops::Bound; /// # fn test() -> tantivy::Result<()> { /// let mut schema_builder = Schema::builder(); /// let year_field = schema_builder.add_u64_field("year", INDEXED); @@ -59,7 +57,10 @@ use crate::{DateTime, DocId, Score}; /// /// let reader = index.reader()?; /// let searcher = reader.searcher(); -/// let docs_in_the_sixties = RangeQuery::new_u64("year".to_string(), 1960..1970); +/// let docs_in_the_sixties = RangeQuery::new( +/// Bound::Included(Term::from_field_u64(year_field, 1960)), +/// Bound::Excluded(Term::from_field_u64(year_field, 1970)), +/// ); /// let num_60s_books = searcher.search(&docs_in_the_sixties, &Count)?; /// assert_eq!(num_60s_books, 2285); /// Ok(()) @@ -68,246 +69,46 @@ use crate::{DateTime, DocId, Score}; /// ``` #[derive(Clone, Debug)] pub struct RangeQuery { - field: String, - value_type: Type, - lower_bound: Bound>, - upper_bound: Bound>, + lower_bound: Bound, + upper_bound: Bound, limit: Option, } +/// Returns the inner value of a `Bound` +pub(crate) fn inner_bound(val: &Bound) -> Option<&Term> { + match val { + Bound::Included(term) | Bound::Excluded(term) => Some(term), + Bound::Unbounded => None, + } +} + impl RangeQuery { /// Creates a new `RangeQuery` from bounded start and end terms. /// /// If the value type is not correct, something may go terribly wrong when /// the `Weight` object is created. 
- pub fn new_term_bounds( - field: String, - value_type: Type, - lower_bound: &Bound, - upper_bound: &Bound, - ) -> RangeQuery { - let verify_and_unwrap_term = |val: &Term| val.serialized_value_bytes().to_owned(); - RangeQuery { - field, - value_type, - lower_bound: map_bound(lower_bound, verify_and_unwrap_term), - upper_bound: map_bound(upper_bound, verify_and_unwrap_term), - limit: None, - } - } - - /// Creates a new `RangeQuery` over a `i64` field. - /// - /// If the field is not of the type `i64`, tantivy - /// will panic when the `Weight` object is created. - pub fn new_i64(field: String, range: Range) -> RangeQuery { - RangeQuery::new_i64_bounds( - field, - Bound::Included(range.start), - Bound::Excluded(range.end), - ) - } - - /// Create a new `RangeQuery` over a `i64` field. - /// - /// The two `Bound` arguments make it possible to create more complex - /// ranges than semi-inclusive range. - /// - /// If the field is not of the type `i64`, tantivy - /// will panic when the `Weight` object is created. - pub fn new_i64_bounds( - field: String, - lower_bound: Bound, - upper_bound: Bound, - ) -> RangeQuery { - let make_term_val = |val: &i64| { - Term::from_field_i64(Field::from_field_id(0), *val) - .serialized_value_bytes() - .to_owned() - }; - RangeQuery { - field, - value_type: Type::I64, - lower_bound: map_bound(&lower_bound, make_term_val), - upper_bound: map_bound(&upper_bound, make_term_val), - limit: None, - } - } - - /// Creates a new `RangeQuery` over a `f64` field. - /// - /// If the field is not of the type `f64`, tantivy - /// will panic when the `Weight` object is created. - pub fn new_f64(field: String, range: Range) -> RangeQuery { - RangeQuery::new_f64_bounds( - field, - Bound::Included(range.start), - Bound::Excluded(range.end), - ) - } - - /// Create a new `RangeQuery` over a `f64` field. - /// - /// The two `Bound` arguments make it possible to create more complex - /// ranges than semi-inclusive range. 
- /// - /// If the field is not of the type `f64`, tantivy - /// will panic when the `Weight` object is created. - pub fn new_f64_bounds( - field: String, - lower_bound: Bound, - upper_bound: Bound, - ) -> RangeQuery { - let make_term_val = |val: &f64| { - Term::from_field_f64(Field::from_field_id(0), *val) - .serialized_value_bytes() - .to_owned() - }; - RangeQuery { - field, - value_type: Type::F64, - lower_bound: map_bound(&lower_bound, make_term_val), - upper_bound: map_bound(&upper_bound, make_term_val), - limit: None, - } - } - - /// Create a new `RangeQuery` over a `u64` field. - /// - /// The two `Bound` arguments make it possible to create more complex - /// ranges than semi-inclusive range. - /// - /// If the field is not of the type `u64`, tantivy - /// will panic when the `Weight` object is created. - pub fn new_u64_bounds( - field: String, - lower_bound: Bound, - upper_bound: Bound, - ) -> RangeQuery { - let make_term_val = |val: &u64| { - Term::from_field_u64(Field::from_field_id(0), *val) - .serialized_value_bytes() - .to_owned() - }; - RangeQuery { - field, - value_type: Type::U64, - lower_bound: map_bound(&lower_bound, make_term_val), - upper_bound: map_bound(&upper_bound, make_term_val), - limit: None, - } - } - - /// Create a new `RangeQuery` over a `ip` field. - /// - /// If the field is not of the type `ip`, tantivy - /// will panic when the `Weight` object is created. - pub fn new_ip_bounds( - field: String, - lower_bound: Bound, - upper_bound: Bound, - ) -> RangeQuery { - let make_term_val = |val: &Ipv6Addr| { - Term::from_field_ip_addr(Field::from_field_id(0), *val) - .serialized_value_bytes() - .to_owned() - }; + pub fn new(lower_bound: Bound, upper_bound: Bound) -> RangeQuery { RangeQuery { - field, - value_type: Type::IpAddr, - lower_bound: map_bound(&lower_bound, make_term_val), - upper_bound: map_bound(&upper_bound, make_term_val), + lower_bound, + upper_bound, limit: None, } } - /// Create a new `RangeQuery` over a `u64` field. 
- /// - /// If the field is not of the type `u64`, tantivy - /// will panic when the `Weight` object is created. - pub fn new_u64(field: String, range: Range) -> RangeQuery { - RangeQuery::new_u64_bounds( - field, - Bound::Included(range.start), - Bound::Excluded(range.end), - ) - } - - /// Create a new `RangeQuery` over a `date` field. - /// - /// The two `Bound` arguments make it possible to create more complex - /// ranges than semi-inclusive range. - /// - /// If the field is not of the type `date`, tantivy - /// will panic when the `Weight` object is created. - pub fn new_date_bounds( - field: String, - lower_bound: Bound, - upper_bound: Bound, - ) -> RangeQuery { - let make_term_val = |val: &DateTime| { - Term::from_field_date(Field::from_field_id(0), *val) - .serialized_value_bytes() - .to_owned() - }; - RangeQuery { - field, - value_type: Type::Date, - lower_bound: map_bound(&lower_bound, make_term_val), - upper_bound: map_bound(&upper_bound, make_term_val), - limit: None, - } - } - - /// Create a new `RangeQuery` over a `date` field. - /// - /// If the field is not of the type `date`, tantivy - /// will panic when the `Weight` object is created. - pub fn new_date(field: String, range: Range) -> RangeQuery { - RangeQuery::new_date_bounds( - field, - Bound::Included(range.start), - Bound::Excluded(range.end), - ) - } - - /// Create a new `RangeQuery` over a `Str` field. - /// - /// The two `Bound` arguments make it possible to create more complex - /// ranges than semi-inclusive range. - /// - /// If the field is not of the type `Str`, tantivy - /// will panic when the `Weight` object is created. 
- pub fn new_str_bounds( - field: String, - lower_bound: Bound<&str>, - upper_bound: Bound<&str>, - ) -> RangeQuery { - let make_term_val = |val: &&str| val.as_bytes().to_vec(); - RangeQuery { - field, - value_type: Type::Str, - lower_bound: map_bound(&lower_bound, make_term_val), - upper_bound: map_bound(&upper_bound, make_term_val), - limit: None, - } + /// Field to search over + pub fn field(&self) -> Field { + self.get_term().field() } - /// Create a new `RangeQuery` over a `Str` field. - /// - /// If the field is not of the type `Str`, tantivy - /// will panic when the `Weight` object is created. - pub fn new_str(field: String, range: Range<&str>) -> RangeQuery { - RangeQuery::new_str_bounds( - field, - Bound::Included(range.start), - Bound::Excluded(range.end), - ) + /// The value type of the field + pub fn value_type(&self) -> Type { + self.get_term().typ() } - /// Field to search over - pub fn field(&self) -> &str { - &self.field + pub(crate) fn get_term(&self) -> &Term { + inner_bound(&self.lower_bound) + .or(inner_bound(&self.upper_bound)) + .expect("At least one bound must be set") } /// Limit the number of term the `RangeQuery` will go through. @@ -319,70 +120,23 @@ impl RangeQuery { } } -/// Returns true if the type maps to a u64 fast field -pub(crate) fn maps_to_u64_fastfield(typ: Type) -> bool { - match typ { - Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date => true, - Type::IpAddr => false, - Type::Str | Type::Facet | Type::Bytes | Type::Json => false, - } -} - impl Query for RangeQuery { fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result> { let schema = enable_scoring.schema(); - let field_type = schema - .get_field_entry(schema.get_field(&self.field)?) 
- .field_type(); - let value_type = field_type.value_type(); - if value_type != self.value_type { - let err_msg = format!( - "Create a range query of the type {:?}, when the field given was of type \ - {value_type:?}", - self.value_type - ); - return Err(TantivyError::SchemaError(err_msg)); - } - - if field_type.is_fast() && is_type_valid_for_fastfield_range_query(self.value_type) { - if field_type.is_ip_addr() { - let parse_ip_from_bytes = |data: &Vec| { - let ip_u128_bytes: [u8; 16] = data.as_slice().try_into().map_err(|_| { - crate::TantivyError::InvalidArgument( - "Expected 8 bytes for ip address".to_string(), - ) - })?; - let ip_u128 = u128::from_be_bytes(ip_u128_bytes); - crate::Result::::Ok(Ipv6Addr::from_u128(ip_u128)) - }; - let lower_bound = map_bound_res(&self.lower_bound, parse_ip_from_bytes)?; - let upper_bound = map_bound_res(&self.upper_bound, parse_ip_from_bytes)?; - Ok(Box::new(IPFastFieldRangeWeight::new( - self.field.to_string(), - lower_bound, - upper_bound, - ))) - } else { - // We run the range query on u64 value space for performance reasons and simpicity - // assert the type maps to u64 - assert!(maps_to_u64_fastfield(self.value_type)); - let parse_from_bytes = |data: &Vec| { - u64::from_be(BinarySerializable::deserialize(&mut &data[..]).unwrap()) - }; - - let lower_bound = map_bound(&self.lower_bound, parse_from_bytes); - let upper_bound = map_bound(&self.upper_bound, parse_from_bytes); - Ok(Box::new(FastFieldRangeWeight::new_u64_lenient( - self.field.to_string(), - lower_bound, - upper_bound, - ))) - } + let field_type = schema.get_field_entry(self.field()).field_type(); + + if field_type.is_fast() && is_type_valid_for_fastfield_range_query(self.value_type()) { + Ok(Box::new(FastFieldRangeWeight::new( + self.field(), + self.lower_bound.clone(), + self.upper_bound.clone(), + ))) } else { + let verify_and_unwrap_term = |val: &Term| val.serialized_value_bytes().to_owned(); Ok(Box::new(RangeWeight { - field: self.field.to_string(), - 
lower_bound: self.lower_bound.clone(), - upper_bound: self.upper_bound.clone(), + field: self.field(), + lower_bound: map_bound(&self.lower_bound, verify_and_unwrap_term), + upper_bound: map_bound(&self.upper_bound, verify_and_unwrap_term), limit: self.limit, })) } @@ -390,7 +144,7 @@ impl Query for RangeQuery { } pub struct RangeWeight { - field: String, + field: Field, lower_bound: Bound>, upper_bound: Bound>, limit: Option, @@ -423,7 +177,7 @@ impl Weight for RangeWeight { let max_doc = reader.max_doc(); let mut doc_bitset = BitSet::with_max_value(max_doc); - let inverted_index = reader.inverted_index(reader.schema().get_field(&self.field)?)?; + let inverted_index = reader.inverted_index(self.field)?; let term_dict = inverted_index.terms(); let mut term_range = self.term_range(term_dict)?; let mut processed_count = 0; @@ -477,7 +231,7 @@ mod tests { use crate::schema::{ Field, IntoIpv6Addr, Schema, TantivyDocument, FAST, INDEXED, STORED, TEXT, }; - use crate::{Index, IndexWriter}; + use crate::{Index, IndexWriter, Term}; #[test] fn test_range_query_simple() -> crate::Result<()> { @@ -499,7 +253,10 @@ mod tests { let reader = index.reader()?; let searcher = reader.searcher(); - let docs_in_the_sixties = RangeQuery::new_u64("year".to_string(), 1960u64..1970u64); + let docs_in_the_sixties = RangeQuery::new( + Bound::Included(Term::from_field_u64(year_field, 1960)), + Bound::Excluded(Term::from_field_u64(year_field, 1970)), + ); // ... or `1960..=1969` if inclusive range is enabled. 
let count = searcher.search(&docs_in_the_sixties, &Count)?; @@ -530,7 +287,10 @@ mod tests { let reader = index.reader()?; let searcher = reader.searcher(); - let mut docs_in_the_sixties = RangeQuery::new_u64("year".to_string(), 1960u64..1970u64); + let mut docs_in_the_sixties = RangeQuery::new( + Bound::Included(Term::from_field_u64(year_field, 1960)), + Bound::Excluded(Term::from_field_u64(year_field, 1970)), + ); docs_in_the_sixties.limit(5); // due to the limit and no docs in 1963, it's really only 1960..=1965 @@ -575,29 +335,29 @@ mod tests { |range_query: RangeQuery| searcher.search(&range_query, &Count).unwrap(); assert_eq!( - count_multiples(RangeQuery::new_i64("intfield".to_string(), 10..11)), + count_multiples(RangeQuery::new( + Bound::Included(Term::from_field_i64(int_field, 10)), + Bound::Excluded(Term::from_field_i64(int_field, 11)), + )), 9 ); assert_eq!( - count_multiples(RangeQuery::new_i64_bounds( - "intfield".to_string(), - Bound::Included(10), - Bound::Included(11) + count_multiples(RangeQuery::new( + Bound::Included(Term::from_field_i64(int_field, 10)), + Bound::Included(Term::from_field_i64(int_field, 11)), )), 18 ); assert_eq!( - count_multiples(RangeQuery::new_i64_bounds( - "intfield".to_string(), - Bound::Excluded(9), - Bound::Included(10) + count_multiples(RangeQuery::new( + Bound::Excluded(Term::from_field_i64(int_field, 9)), + Bound::Included(Term::from_field_i64(int_field, 10)), )), 9 ); assert_eq!( - count_multiples(RangeQuery::new_i64_bounds( - "intfield".to_string(), - Bound::Included(9), + count_multiples(RangeQuery::new( + Bound::Included(Term::from_field_i64(int_field, 9)), Bound::Unbounded )), 91 @@ -646,29 +406,29 @@ mod tests { |range_query: RangeQuery| searcher.search(&range_query, &Count).unwrap(); assert_eq!( - count_multiples(RangeQuery::new_f64("floatfield".to_string(), 10.0..11.0)), + count_multiples(RangeQuery::new( + Bound::Included(Term::from_field_f64(float_field, 10.0)), + 
Bound::Excluded(Term::from_field_f64(float_field, 11.0)), + )), 9 ); assert_eq!( - count_multiples(RangeQuery::new_f64_bounds( - "floatfield".to_string(), - Bound::Included(10.0), - Bound::Included(11.0) + count_multiples(RangeQuery::new( + Bound::Included(Term::from_field_f64(float_field, 10.0)), + Bound::Included(Term::from_field_f64(float_field, 11.0)), )), 18 ); assert_eq!( - count_multiples(RangeQuery::new_f64_bounds( - "floatfield".to_string(), - Bound::Excluded(9.0), - Bound::Included(10.0) + count_multiples(RangeQuery::new( + Bound::Excluded(Term::from_field_f64(float_field, 9.0)), + Bound::Included(Term::from_field_f64(float_field, 10.0)), )), 9 ); assert_eq!( - count_multiples(RangeQuery::new_f64_bounds( - "floatfield".to_string(), - Bound::Included(9.0), + count_multiples(RangeQuery::new( + Bound::Included(Term::from_field_f64(float_field, 9.0)), Bound::Unbounded )), 91 diff --git a/src/query/range_query/range_query_ip_fastfield.rs b/src/query/range_query/range_query_ip_fastfield.rs deleted file mode 100644 index 97f0cdb22e..0000000000 --- a/src/query/range_query/range_query_ip_fastfield.rs +++ /dev/null @@ -1,512 +0,0 @@ -//! IP Fastfields support efficient scanning for range queries. -//! We use this variant only if the fastfield exists, otherwise the default in `range_query` is -//! used, which uses the term dictionary + postings. - -use std::net::Ipv6Addr; -use std::ops::{Bound, RangeInclusive}; - -use columnar::{Column, MonotonicallyMappableToU128}; - -use crate::query::range_query::fast_field_range_query::RangeDocSet; -use crate::query::{ConstScorer, EmptyScorer, Explanation, Scorer, Weight}; -use crate::{DocId, DocSet, Score, SegmentReader, TantivyError}; - -/// `IPFastFieldRangeWeight` uses the ip address fast field to execute range queries. -pub struct IPFastFieldRangeWeight { - field: String, - lower_bound: Bound, - upper_bound: Bound, -} - -impl IPFastFieldRangeWeight { - /// Creates a new IPFastFieldRangeWeight. 
- pub fn new(field: String, lower_bound: Bound, upper_bound: Bound) -> Self { - Self { - field, - lower_bound, - upper_bound, - } - } -} - -impl Weight for IPFastFieldRangeWeight { - fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result> { - let Some(ip_addr_column): Option> = - reader.fast_fields().column_opt(&self.field)? - else { - return Ok(Box::new(EmptyScorer)); - }; - let value_range = bound_to_value_range( - &self.lower_bound, - &self.upper_bound, - ip_addr_column.min_value(), - ip_addr_column.max_value(), - ); - let docset = RangeDocSet::new(value_range, ip_addr_column); - Ok(Box::new(ConstScorer::new(docset, boost))) - } - - fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result { - let mut scorer = self.scorer(reader, 1.0)?; - if scorer.seek(doc) != doc { - return Err(TantivyError::InvalidArgument(format!( - "Document #({doc}) does not match" - ))); - } - let explanation = Explanation::new("Const", scorer.score()); - Ok(explanation) - } -} - -fn bound_to_value_range( - lower_bound: &Bound, - upper_bound: &Bound, - min_value: Ipv6Addr, - max_value: Ipv6Addr, -) -> RangeInclusive { - let start_value = match lower_bound { - Bound::Included(ip_addr) => *ip_addr, - Bound::Excluded(ip_addr) => Ipv6Addr::from(ip_addr.to_u128() + 1), - Bound::Unbounded => min_value, - }; - - let end_value = match upper_bound { - Bound::Included(ip_addr) => *ip_addr, - Bound::Excluded(ip_addr) => Ipv6Addr::from(ip_addr.to_u128() - 1), - Bound::Unbounded => max_value, - }; - start_value..=end_value -} - -#[cfg(test)] -pub mod tests { - use proptest::prelude::ProptestConfig; - use proptest::strategy::Strategy; - use proptest::{prop_oneof, proptest}; - - use super::*; - use crate::collector::Count; - use crate::query::QueryParser; - use crate::schema::{Schema, FAST, INDEXED, STORED, STRING}; - use crate::{Index, IndexWriter}; - - #[derive(Clone, Debug)] - pub struct Doc { - pub id: String, - pub ip: Ipv6Addr, - } - - fn operation_strategy() -> impl 
Strategy { - prop_oneof![ - (0u64..10_000u64).prop_map(doc_from_id_1), - (1u64..10_000u64).prop_map(doc_from_id_2), - ] - } - - pub fn doc_from_id_1(id: u64) -> Doc { - let id = id * 1000; - Doc { - // ip != id - id: id.to_string(), - ip: Ipv6Addr::from_u128(id as u128), - } - } - fn doc_from_id_2(id: u64) -> Doc { - let id = id * 1000; - Doc { - // ip != id - id: (id - 1).to_string(), - ip: Ipv6Addr::from_u128(id as u128), - } - } - - proptest! { - #![proptest_config(ProptestConfig::with_cases(10))] - #[test] - fn test_ip_range_for_docs_prop(ops in proptest::collection::vec(operation_strategy(), 1..1000)) { - assert!(test_ip_range_for_docs(&ops).is_ok()); - } - } - - #[test] - fn test_ip_range_regression1() { - let ops = &[doc_from_id_1(0)]; - assert!(test_ip_range_for_docs(ops).is_ok()); - } - - #[test] - fn test_ip_range_regression2() { - let ops = &[ - doc_from_id_1(52), - doc_from_id_1(63), - doc_from_id_1(12), - doc_from_id_2(91), - doc_from_id_2(33), - ]; - assert!(test_ip_range_for_docs(ops).is_ok()); - } - - #[test] - fn test_ip_range_regression3() { - let ops = &[doc_from_id_1(1), doc_from_id_1(2), doc_from_id_1(3)]; - assert!(test_ip_range_for_docs(ops).is_ok()); - } - - #[test] - fn test_ip_range_regression3_simple() { - let mut schema_builder = Schema::builder(); - let ips_field = schema_builder.add_ip_addr_field("ips", FAST | INDEXED); - let schema = schema_builder.build(); - let index = Index::create_in_ram(schema); - let mut writer: IndexWriter = index.writer_for_tests().unwrap(); - let ip_addrs: Vec = [1000, 2000, 3000] - .into_iter() - .map(Ipv6Addr::from_u128) - .collect(); - for &ip_addr in &ip_addrs { - writer - .add_document(doc!(ips_field=>ip_addr, ips_field=>ip_addr)) - .unwrap(); - } - writer.commit().unwrap(); - let searcher = index.reader().unwrap().searcher(); - let range_weight = IPFastFieldRangeWeight { - field: "ips".to_string(), - lower_bound: Bound::Included(ip_addrs[1]), - upper_bound: Bound::Included(ip_addrs[2]), - }; - let count 
= range_weight.count(searcher.segment_reader(0)).unwrap(); - assert_eq!(count, 2); - } - - pub fn create_index_from_docs(docs: &[Doc]) -> Index { - let mut schema_builder = Schema::builder(); - let ip_field = schema_builder.add_ip_addr_field("ip", STORED | FAST); - let ips_field = schema_builder.add_ip_addr_field("ips", FAST | INDEXED); - let text_field = schema_builder.add_text_field("id", STRING | STORED); - let schema = schema_builder.build(); - let index = Index::create_in_ram(schema); - - { - let mut index_writer = index.writer_with_num_threads(2, 60_000_000).unwrap(); - for doc in docs.iter() { - index_writer - .add_document(doc!( - ips_field => doc.ip, - ips_field => doc.ip, - ip_field => doc.ip, - text_field => doc.id.to_string(), - )) - .unwrap(); - } - - index_writer.commit().unwrap(); - } - index - } - - fn test_ip_range_for_docs(docs: &[Doc]) -> crate::Result<()> { - let index = create_index_from_docs(docs); - let reader = index.reader().unwrap(); - let searcher = reader.searcher(); - - let get_num_hits = |query| searcher.search(&query, &Count).unwrap(); - let query_from_text = |text: &str| { - QueryParser::for_index(&index, vec![]) - .parse_query(text) - .unwrap() - }; - - let gen_query_inclusive = |field: &str, ip_range: &RangeInclusive| { - format!("{field}:[{} TO {}]", ip_range.start(), ip_range.end()) - }; - - let test_sample = |sample_docs: &[Doc]| { - let mut ips: Vec = sample_docs.iter().map(|doc| doc.ip).collect(); - ips.sort(); - let ip_range = ips[0]..=ips[1]; - let expected_num_hits = docs - .iter() - .filter(|doc| (ips[0]..=ips[1]).contains(&doc.ip)) - .count(); - - let query = gen_query_inclusive("ip", &ip_range); - assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits); - - let query = gen_query_inclusive("ips", &ip_range); - assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits); - - // Intersection search - let id_filter = sample_docs[0].id.to_string(); - let expected_num_hits = docs - .iter() - .filter(|doc| 
ip_range.contains(&doc.ip) && doc.id == id_filter) - .count(); - let query = format!( - "{} AND id:{}", - gen_query_inclusive("ip", &ip_range), - &id_filter - ); - assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits); - - // Intersection search on multivalue ip field - let id_filter = sample_docs[0].id.to_string(); - let query = format!( - "{} AND id:{}", - gen_query_inclusive("ips", &ip_range), - &id_filter - ); - assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits); - }; - - test_sample(&[docs[0].clone(), docs[0].clone()]); - if docs.len() > 1 { - test_sample(&[docs[0].clone(), docs[1].clone()]); - test_sample(&[docs[1].clone(), docs[1].clone()]); - } - if docs.len() > 2 { - test_sample(&[docs[1].clone(), docs[2].clone()]); - } - - Ok(()) - } -} - -#[cfg(all(test, feature = "unstable"))] -mod bench { - - use rand::rngs::StdRng; - use rand::{Rng, SeedableRng}; - use test::Bencher; - - use super::tests::*; - use super::*; - use crate::collector::Count; - use crate::query::QueryParser; - use crate::Index; - - fn get_index_0_to_100() -> Index { - let mut rng = StdRng::from_seed([1u8; 32]); - let num_vals = 100_000; - let docs: Vec<_> = (0..num_vals) - .map(|_i| { - let id = if rng.gen_bool(0.01) { - "veryfew".to_string() // 1% - } else if rng.gen_bool(0.1) { - "few".to_string() // 9% - } else { - "many".to_string() // 90% - }; - Doc { - id, - // Multiply by 1000, so that we create many buckets in the compact space - // The benches depend on this range to select n-percent of elements with the - // methods below. 
- ip: Ipv6Addr::from_u128(rng.gen_range(0..100) * 1000), - } - }) - .collect(); - - create_index_from_docs(&docs) - } - - fn get_90_percent() -> RangeInclusive { - let start = Ipv6Addr::from_u128(0); - let end = Ipv6Addr::from_u128(90 * 1000); - start..=end - } - - fn get_10_percent() -> RangeInclusive { - let start = Ipv6Addr::from_u128(0); - let end = Ipv6Addr::from_u128(10 * 1000); - start..=end - } - - fn get_1_percent() -> RangeInclusive { - let start = Ipv6Addr::from_u128(10 * 1000); - let end = Ipv6Addr::from_u128(10 * 1000); - start..=end - } - - fn excute_query( - field: &str, - ip_range: RangeInclusive, - suffix: &str, - index: &Index, - ) -> usize { - let gen_query_inclusive = |from: &Ipv6Addr, to: &Ipv6Addr| { - format!( - "{}:[{} TO {}] {}", - field, - &from.to_string(), - &to.to_string(), - suffix - ) - }; - - let query = gen_query_inclusive(ip_range.start(), ip_range.end()); - let query_from_text = |text: &str| { - QueryParser::for_index(index, vec![]) - .parse_query(text) - .unwrap() - }; - let query = query_from_text(&query); - let reader = index.reader().unwrap(); - let searcher = reader.searcher(); - searcher.search(&query, &(Count)).unwrap() - } - - #[bench] - fn bench_ip_range_hit_90_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| excute_query("ip", get_90_percent(), "", &index)); - } - - #[bench] - fn bench_ip_range_hit_10_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| excute_query("ip", get_10_percent(), "", &index)); - } - - #[bench] - fn bench_ip_range_hit_1_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| excute_query("ip", get_1_percent(), "", &index)); - } - - #[bench] - fn bench_ip_range_hit_10_percent_intersect_with_10_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| excute_query("ip", get_10_percent(), "AND id:few", &index)); - } - - #[bench] - fn 
bench_ip_range_hit_1_percent_intersect_with_10_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| excute_query("ip", get_1_percent(), "AND id:few", &index)); - } - - #[bench] - fn bench_ip_range_hit_1_percent_intersect_with_90_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| excute_query("ip", get_1_percent(), "AND id:many", &index)); - } - - #[bench] - fn bench_ip_range_hit_1_percent_intersect_with_1_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| excute_query("ip", get_1_percent(), "AND id:veryfew", &index)); - } - - #[bench] - fn bench_ip_range_hit_10_percent_intersect_with_90_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| excute_query("ip", get_10_percent(), "AND id:many", &index)); - } - - #[bench] - fn bench_ip_range_hit_90_percent_intersect_with_90_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| excute_query("ip", get_90_percent(), "AND id:many", &index)); - } - - #[bench] - fn bench_ip_range_hit_90_percent_intersect_with_10_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| excute_query("ip", get_90_percent(), "AND id:few", &index)); - } - - #[bench] - fn bench_ip_range_hit_90_percent_intersect_with_1_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| excute_query("ip", get_90_percent(), "AND id:veryfew", &index)); - } - - #[bench] - fn bench_ip_range_hit_90_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| excute_query("ips", get_90_percent(), "", &index)); - } - - #[bench] - fn bench_ip_range_hit_10_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| excute_query("ips", get_10_percent(), "", &index)); - } - - #[bench] - fn bench_ip_range_hit_1_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - 
bench.iter(|| excute_query("ips", get_1_percent(), "", &index)); - } - - #[bench] - fn bench_ip_range_hit_10_percent_intersect_with_10_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| excute_query("ips", get_10_percent(), "AND id:few", &index)); - } - - #[bench] - fn bench_ip_range_hit_1_percent_intersect_with_10_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| excute_query("ips", get_1_percent(), "AND id:few", &index)); - } - - #[bench] - fn bench_ip_range_hit_1_percent_intersect_with_90_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| excute_query("ips", get_1_percent(), "AND id:many", &index)); - } - - #[bench] - fn bench_ip_range_hit_1_percent_intersect_with_1_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| excute_query("ips", get_1_percent(), "AND id:veryfew", &index)); - } - - #[bench] - fn bench_ip_range_hit_10_percent_intersect_with_90_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| excute_query("ips", get_10_percent(), "AND id:many", &index)); - } - - #[bench] - fn bench_ip_range_hit_90_percent_intersect_with_90_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| excute_query("ips", get_90_percent(), "AND id:many", &index)); - } - - #[bench] - fn bench_ip_range_hit_90_percent_intersect_with_10_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| excute_query("ips", get_90_percent(), "AND id:few", &index)); - } - - #[bench] - fn bench_ip_range_hit_90_percent_intersect_with_1_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| excute_query("ips", get_90_percent(), "AND id:veryfew", &index)); - } -} diff --git a/src/query/range_query/range_query_u64_fastfield.rs b/src/query/range_query/range_query_u64_fastfield.rs index 
077b8dcb3e..1db436ccb1 100644 --- a/src/query/range_query/range_query_u64_fastfield.rs +++ b/src/query/range_query/range_query_u64_fastfield.rs @@ -2,54 +2,34 @@ //! We use this variant only if the fastfield exists, otherwise the default in `range_query` is //! used, which uses the term dictionary + postings. +use std::net::Ipv6Addr; use std::ops::{Bound, RangeInclusive}; -use columnar::{ColumnType, HasAssociatedColumnType, MonotonicallyMappableToU64}; +use columnar::{Column, MonotonicallyMappableToU128, MonotonicallyMappableToU64}; +use common::BinarySerializable; -use super::fast_field_range_query::RangeDocSet; -use super::map_bound; -use crate::query::{ConstScorer, EmptyScorer, Explanation, Query, Scorer, Weight}; -use crate::{DocId, DocSet, Score, SegmentReader, TantivyError}; +use super::fast_field_range_doc_set::RangeDocSet; +use super::{map_bound, map_bound_res}; +use crate::query::range_query::range_query::inner_bound; +use crate::query::{AllScorer, ConstScorer, EmptyScorer, Explanation, Query, Scorer, Weight}; +use crate::schema::{Field, Type}; +use crate::{DocId, DocSet, Score, SegmentReader, TantivyError, Term}; /// `FastFieldRangeWeight` uses the fast field to execute range queries. #[derive(Clone, Debug)] pub struct FastFieldRangeWeight { - field: String, - lower_bound: Bound, - upper_bound: Bound, - column_type_opt: Option, + lower_bound: Bound, + upper_bound: Bound, + field: Field, } impl FastFieldRangeWeight { - /// Create a new FastFieldRangeWeight, using the u64 representation of any fast field. 
- pub(crate) fn new_u64_lenient( - field: String, - lower_bound: Bound, - upper_bound: Bound, - ) -> Self { - let lower_bound = map_bound(&lower_bound, |val| *val); - let upper_bound = map_bound(&upper_bound, |val| *val); + /// Create a new FastFieldRangeWeight + pub(crate) fn new(field: Field, lower_bound: Bound, upper_bound: Bound) -> Self { Self { - field, lower_bound, upper_bound, - column_type_opt: None, - } - } - - /// Create a new `FastFieldRangeWeight` for a range of a u64-mappable type . - pub fn new( - field: String, - lower_bound: Bound, - upper_bound: Bound, - ) -> Self { - let lower_bound = map_bound(&lower_bound, |val| val.to_u64()); - let upper_bound = map_bound(&upper_bound, |val| val.to_u64()); - Self { field, - lower_bound, - upper_bound, - column_type_opt: Some(T::column_type()), } } } @@ -65,30 +45,86 @@ impl Query for FastFieldRangeWeight { impl Weight for FastFieldRangeWeight { fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result> { - let fast_field_reader = reader.fast_fields(); - let column_type_opt: Option<[ColumnType; 1]> = - self.column_type_opt.map(|column_type| [column_type]); - let column_type_opt_ref: Option<&[ColumnType]> = column_type_opt - .as_ref() - .map(|column_types| column_types.as_slice()); - let Some((column, _)) = - fast_field_reader.u64_lenient_for_type(column_type_opt_ref, &self.field)? 
- else { - return Ok(Box::new(EmptyScorer)); - }; - #[allow(clippy::reversed_empty_ranges)] - let value_range = bound_to_value_range( - &self.lower_bound, - &self.upper_bound, - column.min_value(), - column.max_value(), - ) - .unwrap_or(1..=0); // empty range - if value_range.is_empty() { - return Ok(Box::new(EmptyScorer)); + // Check if both bounds are Bound::Unbounded + if self.lower_bound == Bound::Unbounded && self.upper_bound == Bound::Unbounded { + return Ok(Box::new(AllScorer::new(reader.max_doc()))); + } + let field_name = reader.schema().get_field_name(self.field); + let field_type = reader.schema().get_field_entry(self.field).field_type(); + if field_type.is_ip_addr() { + // The term value is read as the 16 big-endian bytes of the address in u128 form. + let parse_ip_from_bytes = |term: &Term| { + let ip_u128_bytes: [u8; 16] = + term.serialized_value_bytes().try_into().map_err(|_| { + crate::TantivyError::InvalidArgument( + "Expected 16 bytes for ip address".to_string(), + ) + })?; + let ip_u128 = u128::from_be_bytes(ip_u128_bytes); + crate::Result::<Ipv6Addr>::Ok(Ipv6Addr::from_u128(ip_u128)) + }; + let lower_bound = map_bound_res(&self.lower_bound, parse_ip_from_bytes)?; + let upper_bound = map_bound_res(&self.upper_bound, parse_ip_from_bytes)?; + + let Some(ip_addr_column): Option<Column<Ipv6Addr>> = + reader.fast_fields().column_opt(field_name)? 
+ else { + return Ok(Box::new(EmptyScorer)); + }; + let value_range = bound_to_value_range_ip( + &lower_bound, + &upper_bound, + ip_addr_column.min_value(), + ip_addr_column.max_value(), + ); + let docset = RangeDocSet::new(value_range, ip_addr_column); + Ok(Box::new(ConstScorer::new(docset, boost))) + } else { + assert!( + maps_to_u64_fastfield(field_type.value_type()), + "{:?}", + field_type + ); + + let term = inner_bound(&self.lower_bound) + .or(inner_bound(&self.upper_bound)) + .expect("At least one bound must be set"); + assert_eq!( + term.typ(), + field_type.value_type(), + "Field is of type {:?}, but got term of type {:?}", + field_type, + term.typ() + ); + + let parse_from_bytes = |term: &Term| { + u64::from_be( + BinarySerializable::deserialize(&mut &term.serialized_value_bytes()[..]) + .unwrap(), + ) + }; + + let lower_bound = map_bound(&self.lower_bound, parse_from_bytes); + let upper_bound = map_bound(&self.upper_bound, parse_from_bytes); + + let fast_field_reader = reader.fast_fields(); + let Some((column, _)) = fast_field_reader.u64_lenient_for_type(None, field_name)? 
+ else { + return Ok(Box::new(EmptyScorer)); + }; + #[allow(clippy::reversed_empty_ranges)] + let value_range = bound_to_value_range( + &lower_bound, + &upper_bound, + column.min_value(), + column.max_value(), + ) + .unwrap_or(1..=0); // empty range + if value_range.is_empty() { + return Ok(Box::new(EmptyScorer)); + } + let docset = RangeDocSet::new(value_range, column); + Ok(Box::new(ConstScorer::new(docset, boost))) } - let docset = RangeDocSet::new(value_range, column); - Ok(Box::new(ConstScorer::new(docset, boost))) } fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result { @@ -104,6 +140,35 @@ impl Weight for FastFieldRangeWeight { } } +/// Returns true if the type maps to a u64 fast field +pub(crate) fn maps_to_u64_fastfield(typ: Type) -> bool { + match typ { + Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date => true, + Type::IpAddr => false, + Type::Str | Type::Facet | Type::Bytes | Type::Json => false, + } +} + +/// Converts a pair of IP bounds into an inclusive range. +/// +/// Unbounded sides fall back to `min_value` / `max_value`. An excluded bound with no neighbour on +/// that side (nothing above the largest address, nothing below the all-zero address) yields an +/// empty, reversed range instead of overflowing. +fn bound_to_value_range_ip( + lower_bound: &Bound<Ipv6Addr>, + upper_bound: &Bound<Ipv6Addr>, + min_value: Ipv6Addr, + max_value: Ipv6Addr, +) -> RangeInclusive<Ipv6Addr> { + // `start > end`, so the range contains no address. + let empty_range = || Ipv6Addr::from(1u128)..=Ipv6Addr::from(0u128); + let start_value = match lower_bound { + Bound::Included(ip_addr) => *ip_addr, + Bound::Excluded(ip_addr) => match ip_addr.to_u128().checked_add(1) { + Some(next) => Ipv6Addr::from(next), + None => return empty_range(), + }, + Bound::Unbounded => min_value, + }; + + let end_value = match upper_bound { + Bound::Included(ip_addr) => *ip_addr, + Bound::Excluded(ip_addr) => match ip_addr.to_u128().checked_sub(1) { + Some(prev) => Ipv6Addr::from(prev), + None => return empty_range(), + }, + Bound::Unbounded => max_value, + }; + start_value..=end_value +} + // Returns None, if the range cannot be converted to a inclusive range (which equals to a empty // range). 
fn bound_to_value_range( @@ -141,7 +206,7 @@ pub mod tests { use crate::query::range_query::range_query_u64_fastfield::FastFieldRangeWeight; use crate::query::{QueryParser, Weight}; use crate::schema::{NumericOptions, Schema, SchemaBuilder, FAST, INDEXED, STORED, STRING}; - use crate::{Index, IndexWriter, TERMINATED}; + use crate::{Index, IndexWriter, Term, TERMINATED}; #[derive(Clone, Debug)] pub struct Doc { @@ -213,10 +278,10 @@ pub mod tests { writer.add_document(doc!(field=>52_000u64)).unwrap(); writer.commit().unwrap(); let searcher = index.reader().unwrap().searcher(); - let range_query = FastFieldRangeWeight::new_u64_lenient( - "test_field".to_string(), - Bound::Included(50_000), - Bound::Included(50_002), + let range_query = FastFieldRangeWeight::new( + field, + Bound::Included(Term::from_field_u64(field, 50_000)), + Bound::Included(Term::from_field_u64(field, 50_002)), ); let scorer = range_query .scorer(searcher.segment_reader(0), 1.0f32) @@ -394,6 +459,202 @@ pub mod tests { } } +#[cfg(test)] +pub mod ip_range_tests { + use proptest::prelude::ProptestConfig; + use proptest::strategy::Strategy; + use proptest::{prop_oneof, proptest}; + + use super::*; + use crate::collector::Count; + use crate::query::QueryParser; + use crate::schema::{Schema, FAST, INDEXED, STORED, STRING}; + use crate::{Index, IndexWriter}; + + #[derive(Clone, Debug)] + pub struct Doc { + pub id: String, + pub ip: Ipv6Addr, + } + + fn operation_strategy() -> impl Strategy { + prop_oneof![ + (0u64..10_000u64).prop_map(doc_from_id_1), + (1u64..10_000u64).prop_map(doc_from_id_2), + ] + } + + pub fn doc_from_id_1(id: u64) -> Doc { + let id = id * 1000; + Doc { + // ip != id + id: id.to_string(), + ip: Ipv6Addr::from_u128(id as u128), + } + } + fn doc_from_id_2(id: u64) -> Doc { + let id = id * 1000; + Doc { + // ip != id + id: (id - 1).to_string(), + ip: Ipv6Addr::from_u128(id as u128), + } + } + + proptest! 
{ + #![proptest_config(ProptestConfig::with_cases(10))] + #[test] + fn test_ip_range_for_docs_prop(ops in proptest::collection::vec(operation_strategy(), 1..1000)) { + assert!(test_ip_range_for_docs(&ops).is_ok()); + } + } + + #[test] + fn test_ip_range_regression1() { + let ops = &[doc_from_id_1(0)]; + assert!(test_ip_range_for_docs(ops).is_ok()); + } + + #[test] + fn test_ip_range_regression2() { + let ops = &[ + doc_from_id_1(52), + doc_from_id_1(63), + doc_from_id_1(12), + doc_from_id_2(91), + doc_from_id_2(33), + ]; + assert!(test_ip_range_for_docs(ops).is_ok()); + } + + #[test] + fn test_ip_range_regression3() { + let ops = &[doc_from_id_1(1), doc_from_id_1(2), doc_from_id_1(3)]; + assert!(test_ip_range_for_docs(ops).is_ok()); + } + + #[test] + fn test_ip_range_regression3_simple() { + let mut schema_builder = Schema::builder(); + let ips_field = schema_builder.add_ip_addr_field("ips", FAST | INDEXED); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + let mut writer: IndexWriter = index.writer_for_tests().unwrap(); + let ip_addrs: Vec = [1000, 2000, 3000] + .into_iter() + .map(Ipv6Addr::from_u128) + .collect(); + for &ip_addr in &ip_addrs { + writer + .add_document(doc!(ips_field=>ip_addr, ips_field=>ip_addr)) + .unwrap(); + } + writer.commit().unwrap(); + let searcher = index.reader().unwrap().searcher(); + let range_weight = FastFieldRangeWeight::new( + ips_field, + Bound::Included(Term::from_field_ip_addr(ips_field, ip_addrs[1])), + Bound::Included(Term::from_field_ip_addr(ips_field, ip_addrs[2])), + ); + + let count = + crate::query::weight::Weight::count(&range_weight, searcher.segment_reader(0)).unwrap(); + assert_eq!(count, 2); + } + + pub fn create_index_from_ip_docs(docs: &[Doc]) -> Index { + let mut schema_builder = Schema::builder(); + let ip_field = schema_builder.add_ip_addr_field("ip", STORED | FAST); + let ips_field = schema_builder.add_ip_addr_field("ips", FAST | INDEXED); + let text_field = 
schema_builder.add_text_field("id", STRING | STORED); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + + { + let mut index_writer = index.writer_with_num_threads(2, 60_000_000).unwrap(); + for doc in docs.iter() { + index_writer + .add_document(doc!( + ips_field => doc.ip, + ips_field => doc.ip, + ip_field => doc.ip, + text_field => doc.id.to_string(), + )) + .unwrap(); + } + + index_writer.commit().unwrap(); + } + index + } + + fn test_ip_range_for_docs(docs: &[Doc]) -> crate::Result<()> { + let index = create_index_from_ip_docs(docs); + let reader = index.reader().unwrap(); + let searcher = reader.searcher(); + + let get_num_hits = |query| searcher.search(&query, &Count).unwrap(); + let query_from_text = |text: &str| { + QueryParser::for_index(&index, vec![]) + .parse_query(text) + .unwrap() + }; + + let gen_query_inclusive = |field: &str, ip_range: &RangeInclusive| { + format!("{field}:[{} TO {}]", ip_range.start(), ip_range.end()) + }; + + let test_sample = |sample_docs: &[Doc]| { + let mut ips: Vec = sample_docs.iter().map(|doc| doc.ip).collect(); + ips.sort(); + let ip_range = ips[0]..=ips[1]; + let expected_num_hits = docs + .iter() + .filter(|doc| (ips[0]..=ips[1]).contains(&doc.ip)) + .count(); + + let query = gen_query_inclusive("ip", &ip_range); + assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits); + + let query = gen_query_inclusive("ips", &ip_range); + assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits); + + // Intersection search + let id_filter = sample_docs[0].id.to_string(); + let expected_num_hits = docs + .iter() + .filter(|doc| ip_range.contains(&doc.ip) && doc.id == id_filter) + .count(); + let query = format!( + "{} AND id:{}", + gen_query_inclusive("ip", &ip_range), + &id_filter + ); + assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits); + + // Intersection search on multivalue ip field + let id_filter = sample_docs[0].id.to_string(); + let query = 
format!( + "{} AND id:{}", + gen_query_inclusive("ips", &ip_range), + &id_filter + ); + assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits); + }; + + test_sample(&[docs[0].clone(), docs[0].clone()]); + if docs.len() > 1 { + test_sample(&[docs[0].clone(), docs[1].clone()]); + test_sample(&[docs[1].clone(), docs[1].clone()]); + } + if docs.len() > 2 { + test_sample(&[docs[1].clone(), docs[2].clone()]); + } + + Ok(()) + } +} + #[cfg(all(test, feature = "unstable"))] mod bench { @@ -601,3 +862,242 @@ mod bench { bench.iter(|| execute_query("ids", get_90_percent(), "AND id_name:veryfew", &index)); } } + +#[cfg(all(test, feature = "unstable"))] +mod bench_ip { + + use rand::rngs::StdRng; + use rand::{Rng, SeedableRng}; + use test::Bencher; + + use super::ip_range_tests::*; + use super::*; + use crate::collector::Count; + use crate::query::QueryParser; + use crate::Index; + + fn get_index_0_to_100() -> Index { + let mut rng = StdRng::from_seed([1u8; 32]); + let num_vals = 100_000; + let docs: Vec<_> = (0..num_vals) + .map(|_i| { + let id = if rng.gen_bool(0.01) { + "veryfew".to_string() // 1% + } else if rng.gen_bool(0.1) { + "few".to_string() // 9% + } else { + "many".to_string() // 90% + }; + Doc { + id, + // Multiply by 1000, so that we create many buckets in the compact space + // The benches depend on this range to select n-percent of elements with the + // methods below. 
+ ip: Ipv6Addr::from_u128(rng.gen_range(0..100) * 1000), + } + }) + .collect(); + + create_index_from_ip_docs(&docs) + } + + fn get_90_percent() -> RangeInclusive { + let start = Ipv6Addr::from_u128(0); + let end = Ipv6Addr::from_u128(90 * 1000); + start..=end + } + + fn get_10_percent() -> RangeInclusive { + let start = Ipv6Addr::from_u128(0); + let end = Ipv6Addr::from_u128(10 * 1000); + start..=end + } + + fn get_1_percent() -> RangeInclusive { + let start = Ipv6Addr::from_u128(10 * 1000); + let end = Ipv6Addr::from_u128(10 * 1000); + start..=end + } + + fn excute_query( + field: &str, + ip_range: RangeInclusive, + suffix: &str, + index: &Index, + ) -> usize { + let gen_query_inclusive = |from: &Ipv6Addr, to: &Ipv6Addr| { + format!( + "{}:[{} TO {}] {}", + field, + &from.to_string(), + &to.to_string(), + suffix + ) + }; + + let query = gen_query_inclusive(ip_range.start(), ip_range.end()); + let query_from_text = |text: &str| { + QueryParser::for_index(index, vec![]) + .parse_query(text) + .unwrap() + }; + let query = query_from_text(&query); + let reader = index.reader().unwrap(); + let searcher = reader.searcher(); + searcher.search(&query, &(Count)).unwrap() + } + + #[bench] + fn bench_ip_range_hit_90_percent(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("ip", get_90_percent(), "", &index)); + } + + #[bench] + fn bench_ip_range_hit_10_percent(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("ip", get_10_percent(), "", &index)); + } + + #[bench] + fn bench_ip_range_hit_1_percent(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("ip", get_1_percent(), "", &index)); + } + + #[bench] + fn bench_ip_range_hit_10_percent_intersect_with_10_percent(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("ip", get_10_percent(), "AND id:few", &index)); + } + + #[bench] + fn 
bench_ip_range_hit_1_percent_intersect_with_10_percent(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("ip", get_1_percent(), "AND id:few", &index)); + } + + #[bench] + fn bench_ip_range_hit_1_percent_intersect_with_90_percent(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("ip", get_1_percent(), "AND id:many", &index)); + } + + #[bench] + fn bench_ip_range_hit_1_percent_intersect_with_1_percent(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("ip", get_1_percent(), "AND id:veryfew", &index)); + } + + #[bench] + fn bench_ip_range_hit_10_percent_intersect_with_90_percent(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("ip", get_10_percent(), "AND id:many", &index)); + } + + #[bench] + fn bench_ip_range_hit_90_percent_intersect_with_90_percent(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("ip", get_90_percent(), "AND id:many", &index)); + } + + #[bench] + fn bench_ip_range_hit_90_percent_intersect_with_10_percent(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("ip", get_90_percent(), "AND id:few", &index)); + } + + #[bench] + fn bench_ip_range_hit_90_percent_intersect_with_1_percent(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("ip", get_90_percent(), "AND id:veryfew", &index)); + } + + #[bench] + fn bench_ip_range_hit_90_percent_multi(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("ips", get_90_percent(), "", &index)); + } + + #[bench] + fn bench_ip_range_hit_10_percent_multi(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("ips", get_10_percent(), "", &index)); + } + + #[bench] + fn bench_ip_range_hit_1_percent_multi(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + 
bench.iter(|| excute_query("ips", get_1_percent(), "", &index)); + } + + #[bench] + fn bench_ip_range_hit_10_percent_intersect_with_10_percent_multi(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("ips", get_10_percent(), "AND id:few", &index)); + } + + #[bench] + fn bench_ip_range_hit_1_percent_intersect_with_10_percent_multi(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("ips", get_1_percent(), "AND id:few", &index)); + } + + #[bench] + fn bench_ip_range_hit_1_percent_intersect_with_90_percent_multi(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("ips", get_1_percent(), "AND id:many", &index)); + } + + #[bench] + fn bench_ip_range_hit_1_percent_intersect_with_1_percent_multi(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("ips", get_1_percent(), "AND id:veryfew", &index)); + } + + #[bench] + fn bench_ip_range_hit_10_percent_intersect_with_90_percent_multi(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("ips", get_10_percent(), "AND id:many", &index)); + } + + #[bench] + fn bench_ip_range_hit_90_percent_intersect_with_90_percent_multi(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("ips", get_90_percent(), "AND id:many", &index)); + } + + #[bench] + fn bench_ip_range_hit_90_percent_intersect_with_10_percent_multi(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("ips", get_90_percent(), "AND id:few", &index)); + } + + #[bench] + fn bench_ip_range_hit_90_percent_intersect_with_1_percent_multi(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("ips", get_90_percent(), "AND id:veryfew", &index)); + } +}