diff --git a/lucene/core/src/java/org/apache/lucene/document/package-info.java b/lucene/core/src/java/org/apache/lucene/document/package-info.java index 4b70985be118..f703d6c1ecbe 100644 --- a/lucene/core/src/java/org/apache/lucene/document/package-info.java +++ b/lucene/core/src/java/org/apache/lucene/document/package-info.java @@ -46,10 +46,61 @@ * make the process of taking a file and converting it into a Lucene {@link * org.apache.lucene.document.Document}. * - *
The {@link org.apache.lucene.document.DateTools} is a utility class to make dates and times - * searchable. {@link org.apache.lucene.document.IntPoint}, {@link - * org.apache.lucene.document.LongPoint}, {@link org.apache.lucene.document.FloatPoint} and {@link - * org.apache.lucene.document.DoublePoint} enable indexing of numeric values (and also dates) for - * fast range queries using {@link org.apache.lucene.search.PointRangeQuery} + *
{@link org.apache.lucene.document.TextField} allows indexing tokens from a String so that one + * can perform full-text search on it. The way that the input is tokenized depends on the {@link + * org.apache.lucene.analysis.Analyzer} that is configured on the {@link + * org.apache.lucene.index.IndexWriterConfig}. TextField can also be optionally stored. + * + *
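+ * For instance, a body field could be indexed as follows, assuming an open IndexWriter called
+ * indexWriter (the field name and value are only illustrative):
+ *
+ * <pre class="prettyprint">
+ * Document doc = new Document();
+ * doc.add(new TextField("body", "Lucene is a search library", Field.Store.NO));
+ * indexWriter.addDocument(doc);
+ * </pre>
+ *
+ * <p>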
{@link org.apache.lucene.document.KeywordField} indexes whole values as a single term so that + * one can perform exact search on it. It also records doc values to enable sorting or faceting on + * this field. Finally, the value can optionally be stored. + * + * <p>
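+ * For example, such a field might be indexed and queried along these lines, assuming a Document
+ * under construction called doc (the field name and value are illustrative):
+ *
+ * <pre class="prettyprint">
+ * doc.add(new KeywordField("category", "books", Field.Store.NO));
+ * // at search time, match documents whose category is exactly "books"
+ * Query query = KeywordField.newExactQuery("category", "books");
+ * </pre>
+ *
+ * <p>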
If neither faceting nor sorting is required, {@link org.apache.lucene.document.StringField} is a + * variant of {@link org.apache.lucene.document.KeywordField} that does not index doc values. + * + * <p>
If a numeric field represents an identifier rather than a quantity and is more commonly + * searched on single values than on ranges of values, it is generally recommended to index its + * string representation via {@link org.apache.lucene.document.KeywordField} (or {@link + * org.apache.lucene.document.StringField} if doc values are not necessary). + * + *
{@link org.apache.lucene.document.LongField}, {@link org.apache.lucene.document.IntField}, + * {@link org.apache.lucene.document.DoubleField} and {@link org.apache.lucene.document.FloatField} + * index values in a points index for efficient range queries, and also create doc-values for these + * fields for efficient sorting and faceting. + * + *
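+ * For example, a price field could be indexed and then filtered with a range query, assuming a
+ * Document under construction called doc (the field name and bounds are illustrative):
+ *
+ * <pre class="prettyprint">
+ * doc.add(new LongField("price", 100L, Field.Store.NO));
+ * // at search time, match prices between 50 and 150 inclusive
+ * Query query = LongField.newRangeQuery("price", 50L, 150L);
+ * </pre>
+ *
+ * <p>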
If the field is aimed at being used to tune the score, {@link + * org.apache.lucene.document.FeatureField} helps internally store numeric data as term frequencies + * in a way that makes it efficient to influence scoring at search time. + * + *
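+ * For instance, a pagerank-like signal might be indexed and later folded into scores with a
+ * saturation function, assuming a Document under construction called doc (the field and feature
+ * names are illustrative):
+ *
+ * <pre class="prettyprint">
+ * doc.add(new FeatureField("features", "pagerank", 42f));
+ * // at search time, combine with the main query, e.g. as a SHOULD clause of a BooleanQuery
+ * Query boost = FeatureField.newSaturationQuery("features", "pagerank");
+ * </pre>
+ *
+ * <p>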
It is recommended to index dates as a {@link org.apache.lucene.document.LongField} that stores + * the number of milliseconds since Epoch. + * + *
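+ * For example, assuming a java.time.Instant called instant and a Document under construction
+ * called doc (the field name is illustrative):
+ *
+ * <pre class="prettyprint">
+ * doc.add(new LongField("release_date", instant.toEpochMilli(), Field.Store.NO));
+ * </pre>
+ *
+ * <p>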
IP fields can be indexed via {@link org.apache.lucene.document.InetAddressPoint} in addition + * to a {@link org.apache.lucene.document.SortedDocValuesField} (if the field is single-valued) or + * {@link org.apache.lucene.document.SortedSetDocValuesField} that stores the result of {@link + * org.apache.lucene.document.InetAddressPoint#encode}. + * + *
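+ * A sketch of how this might look for a single-valued field, assuming a Document under
+ * construction called doc (the field name and address are illustrative):
+ *
+ * <pre class="prettyprint">
+ * InetAddress address = InetAddress.getByName("1.2.3.4");
+ * doc.add(new InetAddressPoint("ip", address));
+ * doc.add(new SortedDocValuesField("ip", new BytesRef(InetAddressPoint.encode(address))));
+ * </pre>
+ *
+ * <p>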
Dense numeric vectors can be indexed with {@link + * org.apache.lucene.document.KnnFloatVectorField} if their dimensions are floating-point numbers or + * {@link org.apache.lucene.document.KnnByteVectorField} if their dimensions are bytes. This allows + * searching for nearest neighbors at search time. + * + * <p>
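+ * A minimal sketch; the field name, vector values and similarity function are illustrative, doc
+ * is a Document under construction, and queryVector stands for the query's float[] embedding:
+ *
+ * <pre class="prettyprint">
+ * float[] vector = new float[] {0.1f, 0.2f, 0.3f};
+ * doc.add(new KnnFloatVectorField("embedding", vector, VectorSimilarityFunction.COSINE));
+ * // at search time, retrieve the 10 nearest neighbors of a query vector
+ * Query query = new KnnFloatVectorQuery("embedding", queryVector, 10);
+ * </pre>
+ *
+ * <p>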
To perform nearest-neighbor search on sparse vectors rather than dense vectors, each dimension + * of the sparse vector should be indexed as a {@link org.apache.lucene.document.FeatureField}. + * Queries can then be constructed as a {@link org.apache.lucene.search.BooleanQuery} with {@link + * org.apache.lucene.document.FeatureField#newLinearQuery(String, String, float) linear queries} as + * {@link org.apache.lucene.search.BooleanClause.Occur#SHOULD} clauses. */ package org.apache.lucene.document; diff --git a/lucene/core/src/java/org/apache/lucene/index/package-info.java b/lucene/core/src/java/org/apache/lucene/index/package-info.java index 5dc4bd9ae6de..c554e081af7d 100644 --- a/lucene/core/src/java/org/apache/lucene/index/package-info.java +++ b/lucene/core/src/java/org/apache/lucene/index/package-info.java @@ -70,28 +70,33 @@ *
{@link org.apache.lucene.index.IndexReader} is used to read data from the index, and supports - * searching. Many thread-safe readers may be {@link org.apache.lucene.index.DirectoryReader#open} - * concurrently with a single (or no) writer. Each reader maintains a consistent "point in time" - * view of an index and must be explicitly refreshed (see {@link - * org.apache.lucene.index.DirectoryReader#openIfChanged}) in order to incorporate writes that may - * occur after it is opened. + * searching. Many thread-safe readers may be {@link org.apache.lucene.index.DirectoryReader#open + * open} concurrently with a single (or no) writer. Each reader maintains a consistent "point in + * time" view of an index and must be explicitly refreshed (see {@link + * org.apache.lucene.index.DirectoryReader#openIfChanged(DirectoryReader, IndexWriter)}) in order to + * incorporate writes that may occur after it is opened. * *
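+ * For example, a typical refresh step looks roughly like this, assuming an IndexWriter called
+ * writer (error handling omitted):
+ *
+ * <pre class="prettyprint">
+ * DirectoryReader reader = DirectoryReader.open(writer);
+ * // ... later, after more documents have been indexed ...
+ * DirectoryReader newReader = DirectoryReader.openIfChanged(reader, writer);
+ * if (newReader != null) {
+ *   reader.close();
+ *   reader = newReader;
+ * }
+ * </pre>
+ *
+ * <p>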
Lucene's index is composed of segments, each of which contains a subset of all the documents * in the index, and is a complete searchable index in itself, over that subset. As documents are * written to the index, new segments are created and flushed to directory storage. Segments are - * immutable; updates and deletions may only create new segments and do not modify existing ones. - * Over time, the writer merges groups of smaller segments into single larger ones in order to + * composed of an immutable core, plus per-commit live documents and doc-value updates. Insertions + * add new segments. Deletions and doc-value updates in a given segment create a new segment that + * shares the same core as the previous segment, along with new live docs for this segment. Updates + * are implemented as an atomic insertion and deletion. + * + * <p>
Over time, the writer merges groups of smaller segments into single larger ones in order to * maintain an index that is efficient to search, and to reclaim dead space left behind by deleted * (and updated) documents. * *
Each document is identified by a 32-bit number, its "docid," and is composed of a collection - * of Field values of diverse types (postings, stored fields, doc values, and points). Docids come - * in two flavors: global and per-segment. A document's global docid is just the sum of its - * per-segment docid and that segment's base docid offset. External, high-level APIs only handle - * global docids, but internal APIs that reference a {@link org.apache.lucene.index.LeafReader}, - * which is a reader for a single segment, deal in per-segment docids. + * of Field values of diverse types (postings, stored fields, term vectors, doc values, points and + * knn vectors). Docids come in two flavors: global and per-segment. A document's global docid is + * just the sum of its per-segment docid and that segment's base docid offset. External, high-level + * APIs only handle global docids, but internal APIs that reference a {@link + * org.apache.lucene.index.LeafReader}, which is a reader for a single segment, deal in per-segment + * docids. * *
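+ * For instance, assuming a per-segment docid called perSegmentDocId and an index of the segment
+ * called leafIndex (both illustrative), the global docid can be computed from the segment's
+ * {@link org.apache.lucene.index.LeafReaderContext#docBase}:
+ *
+ * <pre class="prettyprint">
+ * LeafReaderContext leaf = reader.leaves().get(leafIndex);
+ * int globalDocId = leaf.docBase + perSegmentDocId;
+ * </pre>
+ *
+ * <p>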
Docids are assigned sequentially within each segment (starting at 0). Thus the number of * documents in a segment is the same as its maximum docid; some may be deleted, but their docids @@ -117,45 +122,31 @@ * values given a docid. All stored field values for a document are stored together in a block. * Different types of stored field provide high-level datatypes such as strings and numbers on top * of the underlying bytes. Stored field values are usually retrieved by the searcher using an - * implementation of {@link org.apache.lucene.index.StoredFieldVisitor}. + * implementation of {@link org.apache.lucene.index.StoredFieldVisitor}. + * + *
{@link org.apache.lucene.index.TermVectors} store a per-document inverted index. They are + * useful for finding similar documents, a feature that Lucene exposes as MoreLikeThis. * * <p>
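+ * A sketch of reading back the term vector of the "body" field for one document, assuming the
+ * field was indexed with term vectors and that leafReader and docid are already available (names
+ * are illustrative):
+ *
+ * <pre class="prettyprint">
+ * TermVectors termVectors = leafReader.termVectors();
+ * Fields fields = termVectors.get(docid);
+ * Terms terms = fields.terms("body");
+ * </pre>
+ *
+ * <p>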
{@link org.apache.lucene.index.DocValues} fields are what are sometimes referred to as * columnar, or column-stride fields, by analogy to relational database terminology, in which * documents are considered as rows, and fields, columns. DocValues fields store values per-field: a * value for every document is held in a single data structure, providing for rapid, sequential * lookup of a field-value given a docid. These fields are used for efficient value-based sorting, - * and for faceting, but they are not useful for filtering. + * for faceting, and sometimes for filtering on the least selective clauses of a query. * *
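+ * For example, hits may be sorted by a numeric doc-values field, assuming an IndexSearcher called
+ * searcher and a Query called query (the field name is illustrative and assumes doc values were
+ * indexed for it):
+ *
+ * <pre class="prettyprint">
+ * Sort sort = new Sort(new SortField("price", SortField.Type.LONG));
+ * TopDocs topDocs = searcher.search(query, 10, sort);
+ * </pre>
+ *
+ * <p>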
{@link org.apache.lucene.index.PointValues} represent numeric values using a kd-tree data * structure. Efficient 1- and higher dimensional implementations make these the choice for numeric - * range and interval queries, and geo-spatial queries. - * - *
{@link org.apache.lucene.index.KnnVectorValues} represent dense numeric vectors whose + * dimensions may either be bytes or floats. They are indexed in a way that allows searching for + * nearest neighbors. The vectors are typically produced by a machine-learned model, and used to + * perform semantic search. * - *
{@link org.apache.lucene.index.Fields} is the initial entry point into the postings APIs, this - * can be obtained in several ways: + *
- * // access indexed fields for an index segment - * Fields fields = reader.fields(); - * // access term vector fields for a specified document - * TermVectors vectors = reader.termVectors(); - * Fields fields = vectors.get(docid); - *- * - * Fields implements Java's Iterable interface, so it's easy to enumerate the list of fields: - * - *
- * // enumerate list of fields - * for (String field : fields) { - * // access the terms for this field - * Terms terms = fields.terms(field); - * } - *+ *
+ * Terms terms = leafReader.terms("body"); * // metadata about the field * System.out.println("positions? " + terms.hasPositions()); * System.out.println("offsets? " + terms.hasOffsets()); * System.out.println("payloads? " + terms.hasPayloads()); * // iterate through terms - * TermsEnum termsEnum = terms.iterator(null); + * TermsEnum termsEnum = terms.iterator(); * BytesRef term = null; * while ((term = termsEnum.next()) != null) { - * doSomethingWith(termsEnum.term()); + * doSomethingWith(term); * } ** @@ -188,9 +180,9 @@ * // get the document frequency * System.out.println(termsEnum.docFreq()); * // enumerate through documents - * PostingsEnum docs = termsEnum.postings(null, null); + * PostingsEnum docs = termsEnum.postings(null); * // enumerate through documents and positions - * PostingsEnum docsAndPositions = termsEnum.postings(null, null, PostingsEnum.FLAG_POSITIONS); + * PostingsEnum docsAndPositions = termsEnum.postings(null, PostingsEnum.POSITIONS); * } * * @@ -199,7 +191,7 @@ *
{@link org.apache.lucene.index.PostingsEnum} is an extension of {@link - * org.apache.lucene.search.DocIdSetIterator}that iterates over the list of documents for a term, + * org.apache.lucene.search.DocIdSetIterator} that iterates over the list of documents for a term, * along with the term frequency within that document. * *
@@ -207,12 +199,12 @@ * while ((docid = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { * System.out.println(docid); * System.out.println(docsEnum.freq()); - * } + * } ** * * - *
PostingsEnum also allows iteration of the positions a term occurred within the document, and * any additional per-position information (offsets and payload). The information available is @@ -220,19 +212,41 @@ * *
* int docid; - * PostingsEnum postings = termsEnum.postings(null, null, PostingsEnum.FLAG_PAYLOADS | PostingsEnum.FLAG_OFFSETS); + * PostingsEnum postings = termsEnum.postings(null, PostingsEnum.PAYLOADS | PostingsEnum.OFFSETS); * while ((docid = postings.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { * System.out.println(docid); * int freq = postings.freq(); * for (int i = 0; i < freq; i++) { - * System.out.println(postings.nextPosition()); - * System.out.println(postings.startOffset()); - * System.out.println(postings.endOffset()); - * System.out.println(postings.getPayload()); + * System.out.println(postings.nextPosition()); + * System.out.println(postings.startOffset()); + * System.out.println(postings.endOffset()); + * System.out.println(postings.getPayload()); * } * } ** + *
TermsEnum also allows returning an {@link org.apache.lucene.index.ImpactsEnum}, an extension + * of PostingsEnum that exposes pareto-optimal tuples of (term frequency, length normalization + * factor) per block of postings. It is typically used to compute the maximum possible score over + * these blocks of postings, so that they can be skipped if they cannot possibly produce a + * competitive hit. + * + *
+ * int docid; + * ImpactsEnum impactsEnum = termsEnum.impacts(PostingsEnum.FREQS); + * int targetDocID = 420; + * impactsEnum.advanceShallow(targetDocID); + * // These impacts expose pareto-optimal tuples of (termFreq, lengthNorm) over various ranges of doc IDs. + * Impacts impacts = impactsEnum.getImpacts(); + * for (int level = 0; level &lt; impacts.numLevels(); level++) { + * int docIdUpTo = impacts.getDocIdUpTo(level); + * // List of pareto-optimal (termFreq, lengthNorm) tuples between targetDocID inclusive and docIdUpTo inclusive. + * List&lt;Impact&gt; perLevelImpacts = impacts.getImpacts(level); + * } + * </pre> + * * * * <p>
Make sure to look at {@link org.apache.lucene.search.Query} factory methods on {@link + * org.apache.lucene.index.IndexableField}s that you feed into the index writer; they are convenient + * to use and sometimes more efficient than a naively constructed {@link + * org.apache.lucene.search.Query}. See {@link + * org.apache.lucene.document.LongField#newRangeQuery(String, long, long)} for instance. + * * <p>
To perform a search, applications usually call {@link * org.apache.lucene.search.IndexSearcher#search(Query,int)}. * @@ -204,7 +210,8 @@ * documents that need to be scored based on boolean logic in the Query specification, and then * ranks this subset of matching documents via the retrieval model. For some valuable references on * VSM and IR in general refer to Lucene Wiki IR references. + * href="https://cwiki.apache.org/confluence/display/LUCENEJAVA/InformationRetrieval">Lucene Wiki IR + * references. * *
The rest of this document will cover Scoring basics and explain * how to change your {@link org.apache.lucene.search.similarities.Similarity Similarity}. Next, it @@ -253,8 +260,12 @@ * org.apache.lucene.index.IndexWriterConfig#setSimilarity(org.apache.lucene.search.similarities.Similarity) * IndexWriterConfig.setSimilarity(Similarity)} and at query-time with {@link * org.apache.lucene.search.IndexSearcher#setSimilarity(org.apache.lucene.search.similarities.Similarity) - * IndexSearcher.setSimilarity(Similarity)}. Be sure to use the same Similarity at query-time as at - * index-time (so that norms are encoded/decoded correctly); Lucene makes no effort to verify this. + * IndexSearcher.setSimilarity(Similarity)}. Be sure to use search-time similarities that encode the + * length normalization factor the same way as the similarity that you used at index time. All + * Lucene built-in similarities use the default encoding so they are compatible, but if you use a + * custom similarity that changes the encoding of the length normalization factor, you are on your + * own: Lucene makes no effort to ensure that the index-time and the search-time similarities are + * compatible. * *
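+ * For instance, both sides might be configured with the same built-in similarity, assuming an
+ * Analyzer called analyzer and an IndexReader called reader (the parameter values are
+ * illustrative):
+ *
+ * <pre class="prettyprint">
+ * IndexWriterConfig config = new IndexWriterConfig(analyzer);
+ * config.setSimilarity(new BM25Similarity(1.2f, 0.75f));
+ * // ... index documents with an IndexWriter created from this config ...
+ * IndexSearcher searcher = new IndexSearcher(reader);
+ * searcher.setSimilarity(new BM25Similarity(1.2f, 0.75f));
+ * </pre>
+ *
+ * <p>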
You can influence scoring by configuring a different built-in Similarity implementation, or by * tweaking its parameters, subclassing it to override behavior. Some implementations also offer a diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/package-info.java b/lucene/core/src/java/org/apache/lucene/search/similarities/package-info.java index 3ba83ba8a085..e9feb1968773 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/package-info.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/package-info.java @@ -76,10 +76,14 @@ * org.apache.lucene.search.similarities.BM25Similarity#BM25Similarity(float,float) b} parameter to * {@code 0}. * - *
To change {@link org.apache.lucene.search.similarities.Similarity}, one must do so for both - * indexing and searching, and the changes must happen before either of these actions take place. - * Although in theory there is nothing stopping you from changing mid-stream, it just isn't - * well-defined what is going to happen. + *
To switch to a {@link org.apache.lucene.search.similarities.Similarity} that encodes the + * length normalization differently, one must do so for both indexing and searching, and the changes + * must happen before either of these actions take place. Note that all of Lucene's built-in + * similarities - and more generally all {@link org.apache.lucene.search.similarities.Similarity} + * sub-classes that don't override {@link + * org.apache.lucene.search.similarities.Similarity#computeNorm(org.apache.lucene.index.FieldInvertState)} + * - encode the length normalization factor the same way, so it is fine to change the similarity at + * search-time without recreating the index. * *
To make this change, implement your own {@link * org.apache.lucene.search.similarities.Similarity} (likely you'll want to simply subclass {@link diff --git a/lucene/core/src/java/overview.html b/lucene/core/src/java/overview.html index e78eff90d442..147b955d5dd8 100644 --- a/lucene/core/src/java/overview.html +++ b/lucene/core/src/java/overview.html @@ -20,7 +20,9 @@
-Apache Lucene is a high-performance, full-featured text search engine library. +
Apache Lucene is a high-performance, full-featured search engine library. +It supports structured search, full-text search, faceting, nearest-neighbor +search across high-dimensional vectors, spell correction, and query suggestions. Here's a simple example how to use Lucene for indexing and searching (using JUnit to check if the results are what we expect):
@@ -31,32 +33,33 @@ Analyzer analyzer = new StandardAnalyzer(); Path indexPath = Files.createTempDirectory("tempIndex"); - Directory directory = FSDirectory.open(indexPath); - IndexWriterConfig config = new IndexWriterConfig(analyzer); - IndexWriter iwriter = new IndexWriter(directory, config); - Document doc = new Document(); - String text = "This is the text to be indexed."; - doc.add(new Field("fieldname", text, TextField.TYPE_STORED)); - iwriter.addDocument(doc); - iwriter.close(); - - // Now search the index: - DirectoryReader ireader = DirectoryReader.open(directory); - IndexSearcher isearcher = new IndexSearcher(ireader); - // Parse a simple query that searches for "text": - QueryParser parser = new QueryParser("fieldname", analyzer); - Query query = parser.parse("text"); - ScoreDoc[] hits = isearcher.search(query, 10).scoreDocs; - assertEquals(1, hits.length); - // Iterate through the results: - StoredFields storedFields = isearcher.storedFields(); - for (int i = 0; i < hits.length; i++) { - Document hitDoc = storedFields.document(hits[i].doc); - assertEquals("This is the text to be indexed.", hitDoc.get("fieldname")); - } - ireader.close(); - directory.close(); - IOUtils.rm(indexPath); + try (Directory directory = FSDirectory.open(indexPath)) { + IndexWriterConfig config = new IndexWriterConfig(analyzer); + try (IndexWriter iwriter = new IndexWriter(directory, config)) { + Document doc = new Document(); + String text = "This is the text to be indexed."; + doc.add(new Field("fieldname", text, TextField.TYPE_STORED)); + iwriter.addDocument(doc); + } + + // Now search the index: + try (DirectoryReader ireader = DirectoryReader.open(directory)) { + IndexSearcher isearcher = new IndexSearcher(ireader); + // Parse a simple query that searches for "text": + QueryParser parser = new QueryParser("fieldname", analyzer); + Query query = parser.parse("text"); + ScoreDoc[] hits = isearcher.search(query, 10).scoreDocs; + assertEquals(1, hits.length); + // Iterate through the results: + StoredFields storedFields = isearcher.storedFields(); + for (int i = 0; i < hits.length; i++) { + Document hitDoc = storedFields.document(hits[i].doc); + assertEquals("This is the text to be indexed.", hitDoc.get("fieldname")); + } + } + } finally { + IOUtils.rm(indexPath); + } @@ -67,7 +70,7 @@