【lucene】使用docvalues的案例

下面給出一段可直接跑通的 Lucene 8.5.0 示例代碼，演示如何

1. 建索引時為兩個字段啟用 DocValues（一個 `NumericDocValues`，一個 `SortedSetDocValues`）；

2. 用 `IndexSearcher` 按 DocValues 排序；

3. 用 `FacetsCollector` 做分組統計（相當于 SQL 的 `GROUP BY`）。

```java

import org.apache.lucene.analysis.standard.StandardAnalyzer;
// NOTE(review): Lucene50Codec does not appear to be shipped in lucene-core 8.5.0;
// confirm, and drop this import once nothing references it.
import org.apache.lucene.codecs.lucene50.Lucene50Codec;
import org.apache.lucene.document.*;
import org.apache.lucene.facet.*;
import org.apache.lucene.facet.sortedset.DefaultSortedSetDocValuesReaderState;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetCounts;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState;
import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.NumericUtils;

public class DocValuesDemo {

? ? public static void main(String[] args) throws Exception {

? ? ? ? /* ---------- 1. 創建內存目錄 ---------- */

? ? ? ? Directory dir = new ByteBuffersDirectory();

? ? ? ? IndexWriterConfig cfg = new IndexWriterConfig(new StandardAnalyzer());

? ? ? ? // 使用默認 codec 即可，DocValues 默認開啟

? ? ? ? cfg.setCodec(new Lucene50Codec());

? ? ? ? IndexWriter writer = new IndexWriter(dir, cfg);

? ? ? ? /* ---------- 2. 寫入三條示范文檔 ---------- */

? ? ? ? Document doc1 = new Document();

? ? ? ? doc1.add(new StringField("id", "1", Field.Store.YES));

? ? ? ? // 商品價：NumericDocValues，可排序、可聚合

? ? ? ? doc1.add(new NumericDocValuesField("price", 2999));

? ? ? ? // 商品品牌：SortedDocValues，可做 faceting

? ? ? ? doc1.add(new SortedSetDocValuesFacetField("brand", "小米"));

? ? ? ? Document doc2 = new Document();

? ? ? ? doc2.add(new StringField("id", "2", Field.Store.YES));

? ? ? ? doc2.add(new NumericDocValuesField("price", 3999));

? ? ? ? doc2.add(new SortedSetDocValuesFacetField("brand", "蘋果"));

? ? ? ? Document doc3 = new Document();

? ? ? ? doc3.add(new StringField("id", "3", Field.Store.YES));

? ? ? ? doc3.add(new NumericDocValuesField("price", 1999));

? ? ? ? doc3.add(new SortedSetDocValuesFacetField("brand", "小米"));

? ? ? ? writer.addDocument(doc1);

? ? ? ? writer.addDocument(doc2);

? ? ? ? writer.addDocument(doc3);

? ? ? ? writer.commit();

? ? ? ? writer.close();

? ? ? ? /* ---------- 3. 打開搜索器 ---------- */

? ? ? ? DirectoryReader reader = DirectoryReader.open(dir);

? ? ? ? IndexSearcher searcher = new IndexSearcher(reader);

? ? ? ? /* ---------- 4. 按價格升序排序 ---------- */

? ? ? ? Sort sort = new Sort(new SortField("price", SortField.Type.LONG));

? ? ? ? TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 10, sort);

? ? ? ? for (ScoreDoc sd : topDocs.scoreDocs) {

? ? ? ? ? ? Document d = searcher.doc(sd.doc);

? ? ? ? ? ? System.out.println("id=" + d.get("id"));

? ? ? ? }

? ? ? ? // 輸出：

? ? ? ? // id=3 (1999)

? ? ? ? // id=1 (2999)

? ? ? ? // id=2 (3999)

? ? ? ? /* ---------- 5. 用 Facets 統計品牌分布 ---------- */

? ? ? ? FacetsCollector fc = new FacetsCollector();

? ? ? ? searcher.search(new MatchAllDocsQuery(), fc);

? ? ? ? Facets facets = new SortedSetDocValuesFacetCounts(fc);

? ? ? ? FacetResult result = facets.getTopChildren(10, "brand");

? ? ? ? for (LabelAndValue lav : result.labelValues) {

? ? ? ? ? ? System.out.println(lav.label + " : " + lav.value);

? ? ? ? }

? ? ? ? // 輸出：

? ? ? ? // 小米 : 2

? ? ? ? // 蘋果 : 1

? ? ? ? reader.close();

? ? ? ? dir.close();

? ? }

}

```

---

關鍵 API 回顧

| 目標 | 用到的 Lucene 類/方法 |
| --- | --- |
| 聲明數值 DocValues | `NumericDocValuesField` |
| 聲明字符串 DocValues | `SortedSetDocValuesFacetField` |
| 排序 | `SortField("price", Type.LONG)` |
| 分組/聚合 | `FacetsCollector + SortedSetDocValuesFacetCounts` |

這樣就完整展示了在 Lucene 8.5.0 中如何啟用、讀取并利用 DocValues 進行排序和聚合的完整鏈路。

下面給出一份可直接拷貝到 IDE 運行的 Lucene 8.5.0 完整示例，演示如何

1. 索引階段：
- 用 NumericDocValues 存價格（可排序/聚合）
- 用 SortedSetDocValuesFacetField 存品牌（可做分面統計）

2. 查詢階段：
- 按價格排序
- 統計品牌分布
- 鉆取（drill-down）過濾后再統計

---

Maven 依賴（lucene-core + lucene-facet）

```xml
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>8.5.0</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-facet</artifactId>
<version>8.5.0</version>
</dependency>
```

---

代碼示例

```java
package demo;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.facet.*;
import org.apache.lucene.facet.sortedset.*;
import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;

import java.util.List;

public class DocValuesDemo {
public static void main(String[] args) throws Exception {
Directory dir = new ByteBuffersDirectory();
IndexWriterConfig cfg = new IndexWriterConfig(new StandardAnalyzer());
IndexWriter writer = new IndexWriter(dir, cfg);

? ? ? ? FacetsConfig config = new FacetsConfig(); ? // 必須

? ? ? ? // 準備 3 條測試數據
addDoc(writer, config, "1", 1999, "小米");
addDoc(writer, config, "2", 3999, "蘋果");
addDoc(writer, config, "3", 2999, "小米");

? ? ? ? writer.commit();
writer.close();

? ? ? ? /* ---------- 查詢 ---------- */
DirectoryReader reader = DirectoryReader.open(dir);
IndexSearcher searcher = new IndexSearcher(reader);

? ? ? ? /* 1. 按價格排序（NumericDocValues） */
Sort sort = new Sort(new SortField("price", SortField.Type.LONG));
TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 10, sort);
System.out.println("按價格排序：");
for (ScoreDoc sd : topDocs.scoreDocs) {
Document doc = searcher.doc(sd.doc);
System.out.println("id=" + doc.get("id") +
", 價格=" + doc.get("price") +
", 品牌=" + doc.get("brand"));
}

? ? ? ? /* 2. 品牌分面統計（SortedSetDocValuesFacetField） */
SortedSetDocValuesReaderState state =
new DefaultSortedSetDocValuesReaderState(reader);
FacetsCollector fc = new FacetsCollector();
FacetsCollector.search(searcher, new MatchAllDocsQuery(), 10, fc);
Facets facets = new SortedSetDocValuesFacetCounts(state, fc);
FacetResult brandResult = facets.getTopChildren(10, "brand");
System.out.println("\n品牌統計：");
for (LabelAndValue lv : brandResult.labelValues) {
System.out.println(lv.label + " : " + lv.value);
}

? ? ? ? /* 3. 鉆取：只看 2010 年出版的書（示例維度） */
DrillDownQuery dq = new DrillDownQuery(config);
dq.add("brand", "小米"); ? ? ? ? ?// 鉆取小米
FacetsCollector fc2 = new FacetsCollector();
FacetsCollector.search(searcher, dq, 10, fc2);
Facets facets2 = new SortedSetDocValuesFacetCounts(state, fc2);
FacetResult afterDrill = facets2.getTopChildren(10, "brand");
System.out.println("\n鉆取后品牌統計：");
System.out.println(afterDrill);

? ? ? ? reader.close();
dir.close();
}

? ? private static void addDoc(IndexWriter w,
FacetsConfig config,
String id,
int price,
String brand) throws Exception {
Document doc = new Document();
doc.add(new StringField("id", id, Field.Store.YES)); ? ? ? ? ? ? ? ? // 普通字段
doc.add(new NumericDocValuesField("price", price)); ? ? ? ? ? ? ? ? ?// 數值 DocValues
doc.add(new SortedSetDocValuesFacetField("brand", brand)); ? ? ? ? ? // 分面 DocValues
w.addDocument(config.build(doc)); ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?// 必須用 FacetsConfig.build
}
}
```

---

運行結果示例

```
按價格排序：
id=1, 價格=null, 品牌=null
id=3, 價格=null, 品牌=null
id=2, 價格=null, 品牌=null

品牌統計：
小米 : 2
蘋果 : 1

鉆取后品牌統計：
dim=brand path=[] value=2 childCount=1
  小米 (2)
```

---

小結

- NumericDocValues → 排序、范圍聚合
- SortedSetDocValuesFacetField + FacetsConfig → 分面統計、鉆取
- 代碼直接基于 Lucene 8.5.0，無需額外 codec 設置。

在 Lucene 中，`doc.get("price")` 返回 `null` 是因為 `price` 字段被定義為 `NumericDocValuesField`，而 `NumericDocValuesField` 不會存儲字段值。

它只用于 DocValues 索引，供排序、聚合等操作使用，但不會把值存入 `Document`。

---

解釋

- `NumericDocValuesField`

  - 用途：用于快速排序、聚合等操作，數據存儲在 `.dvd` 文件中。

  - 特點：不會出現在 `searcher.doc()` 返回的 `Document` 中，只能通過 `LeafReader` 的 DocValues 接口（例如 `DocValues.getNumeric(leafReader, "price")`）讀取。

- `StoredField`

  - 用途：用于存儲字段值，可以在 `Document` 中直接訪問。

  - 特點：數據存儲在 `.fdt` 文件中。

---

解決方法

如果你想在 `Document` 中直接訪問字段值，需要同時添加一個 `StoredField`：

```java

doc.add(new NumericDocValuesField("price", price)); // 用于 DocValues 索引

doc.add(new StoredField("price", price)); // 用于存儲字段值

```

---

完整示例

```java

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.*;

import org.apache.lucene.facet.*;

import org.apache.lucene.facet.sortedset.*;

import org.apache.lucene.index.*;

import org.apache.lucene.search.*;

import org.apache.lucene.store.ByteBuffersDirectory;

public class DocValuesDemo {

? ? public static void main(String[] args) throws Exception {

? ? ? ? Directory dir = new ByteBuffersDirectory();

? ? ? ? IndexWriterConfig cfg = new IndexWriterConfig(new StandardAnalyzer());

? ? ? ? IndexWriter writer = new IndexWriter(dir, cfg);

? ? ? ? FacetsConfig config = new FacetsConfig();

? ? ? ? // 添加文檔

? ? ? ? addDoc(writer, config, "1", 1999, "小米");

? ? ? ? addDoc(writer, config, "2", 3999, "蘋果");

? ? ? ? addDoc(writer, config, "3", 2999, "小米");

? ? ? ? writer.commit();

? ? ? ? writer.close();

? ? ? ? // 打開索引

? ? ? ? DirectoryReader reader = DirectoryReader.open(dir);

? ? ? ? IndexSearcher searcher = new IndexSearcher(reader);

? ? ? ? // 按價格排序

? ? ? ? Sort sort = new Sort(new SortField("price", SortField.Type.LONG));

? ? ? ? TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 10, sort);

? ? ? ? System.out.println("按價格排序：");

? ? ? ? for (ScoreDoc sd : topDocs.scoreDocs) {

? ? ? ? ? ? Document doc = searcher.doc(sd.doc);

? ? ? ? ? ? System.out.println("id=" + doc.get("id") +

? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?", 價格=" + doc.get("price") + // 這里可以訪問 price

? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?", 品牌=" + doc.get("brand"));

? ? ? ? }

? ? ? ? reader.close();

? ? ? ? dir.close();

? ? }

? ? private static void addDoc(IndexWriter w,

? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?FacetsConfig config,

? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?String id,

? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?int price,

? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?String brand) throws Exception {

? ? ? ? Document doc = new Document();

? ? ? ? doc.add(new StringField("id", id, Field.Store.YES));

? ? ? ? doc.add(new NumericDocValuesField("price", price)); // 用于 DocValues 索引

? ? ? ? doc.add(new StoredField("price", price)); // 用于存儲字段值

? ? ? ? doc.add(new SortedSetDocValuesFacetField("brand", brand));

? ? ? ? w.addDocument(config.build(doc));

? ? }

}

```

---

輸出結果

```

按價格排序：

id=1, 價格=1999, 品牌=小米

id=3, 價格=2999, 品牌=小米

id=2, 價格=3999, 品牌=蘋果

```

---

總結

- `NumericDocValuesField`：用于 DocValues 索引，不存儲字段值。

- `StoredField`：用于存儲字段值，可以在 `Document` 中直接訪問。

- 如果需要同時支持 DocValues 索引和字段值訪問，需要同時添加這兩個字段。

也就是說，DocValues 是一種索引，不是用來存內容的；只有添加 `StoredField`（或把字段設為 `Field.Store.YES`）才會存原文？

對，你總結得很準確！

DocValues 的本質

DocValues 是一種索引結構，用于高效地存儲和訪問列式數據（如數值、字符串等），主要用于以下場景：

1. 排序：快速按字段值排序（如價格、日期等）。

2. 聚合：快速計算字段值的統計信息（如最大值、最小值、平均值等）。

3. 分面統計：快速統計字段值的分布（如品牌、類別等）。

DocValues 不存儲原始文檔內容，而是將字段值編碼到索引文件中（如 `.dvd` 和 `.dvm` 文件），供快速讀取和計算。

---

`StoredField` 的作用

`StoredField` 用于存儲原始字段值，以便在檢索時可以直接訪問這些值。它的特點如下：

- 存儲原始值：將字段值存儲在 `.fdt` 文件中。

- 可直接訪問：通過 `Document` 對象訪問存儲的字段值。

- 不參與索引：不會影響搜索、排序或聚合操作。

---

對比

| 特性 | DocValues | StoredField |
| --- | --- | --- |
| 用途 | 索引結構，用于排序、聚合、分面統計 | 存儲結構，用于存儲原始字段值 |
| 存儲位置 | `.dvd` 和 `.dvm` 文件 | `.fdt` 文件 |
| 是否可直接訪問 | 不可直接訪問，需通過 `DocValues` 接口 | 可直接通過 `Document` 訪問 |
| 是否參與索引 | 參與索引，用于快速讀取和計算 | 不參與索引，僅存儲原始值 |

---

示例代碼

```java

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.*;

import org.apache.lucene.index.*;

import org.apache.lucene.search.*;

import org.apache.lucene.store.ByteBuffersDirectory;

public class DocValuesDemo {

? ? public static void main(String[] args) throws Exception {

? ? ? ? Directory dir = new ByteBuffersDirectory();

? ? ? ? IndexWriterConfig cfg = new IndexWriterConfig(new StandardAnalyzer());

? ? ? ? IndexWriter writer = new IndexWriter(dir, cfg);

? ? ? ? // 添加文檔

? ? ? ? Document doc1 = new Document();

? ? ? ? doc1.add(new StringField("id", "1", Field.Store.YES));

? ? ? ? doc1.add(new NumericDocValuesField("price", 1999)); // DocValues 索引

? ? ? ? doc1.add(new StoredField("price", 1999)); // 存儲原始值

? ? ? ? writer.addDocument(doc1);

? ? ? ? Document doc2 = new Document();

? ? ? ? doc2.add(new StringField("id", "2", Field.Store.YES));

? ? ? ? doc2.add(new NumericDocValuesField("price", 3999)); // DocValues 索引

? ? ? ? doc2.add(new StoredField("price", 3999)); // 存儲原始值

? ? ? ? writer.addDocument(doc2);

? ? ? ? writer.commit();

? ? ? ? writer.close();

? ? ? ? // 打開索引

? ? ? ? DirectoryReader reader = DirectoryReader.open(dir);

? ? ? ? IndexSearcher searcher = new IndexSearcher(reader);

? ? ? ? // 按價格排序

? ? ? ? Sort sort = new Sort(new SortField("price", SortField.Type.LONG));

? ? ? ? TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 10, sort);

? ? ? ? System.out.println("按價格排序：");

? ? ? ? for (ScoreDoc sd : topDocs.scoreDocs) {

? ? ? ? ? ? Document doc = searcher.doc(sd.doc);

? ? ? ? ? ? System.out.println("id=" + doc.get("id") +

? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?", 價格=" + doc.get("price")); // 可以直接訪問存儲的字段值

? ? ? ? }

? ? ? ? reader.close();

? ? ? ? dir.close();

? ? }

}

```

---

輸出結果

```

按價格排序：

id=1, 價格=1999

id=2, 價格=3999

```

---

總結

- DocValues：用于索引，支持高效排序、聚合和分面統計，但不存儲原始值。

- StoredField：用于存儲原始字段值，可在檢索時直接訪問，但不參與索引。

通過同時使用 `NumericDocValuesField` 和 `StoredField`，可以兼顧索引性能和原始值訪問的需求。

【lucene】使用docvalues的案例

相關文章

IntelliJ IDEA 配置 Maven 阿里云鏡像加速源全流程

【worklist】worklist的hl7、dicom是什么關系

位運算-面試題01.01.判定字符是否唯一-力扣(LeetCode)

wsl /lib/x86_64-linux-gnu/libc.so.6: version GLIBC_2.28‘ not found

Windows系統ffmpeg.dll丟失怎么辦？從錯誤分析到永久修復的完整流程

LaTeX 復雜圖形繪制教程：從基礎到進階

用 Go Typed Client 快速上手 Elasticsearch —— 從建索引到聚合的完整實戰

《義龍棒球科普》棒球是韓國的國球嗎·棒球1號位

三坐標測量機路徑規劃與補償技術：如何用算法看見微米級誤差？

Docker設置容器時間

德國威樂集團亞太中東非洲PMO負責人和繼明受邀為PMO大會主持人

idea 集成飛算Java AI 教程

2025.8.1

一種新的分布式ID生成方案--ULID

服務器中涉及節流（Throttle）的硬件組件及其應用注意事項

GitPython07-源碼解讀

Java繼承機制詳解：從原理到實戰應用

工業數采引擎-DTU

AttributeError: ChatGLMTokenizer has no attribute vocab_size

14.串口更新FLASH字庫