diff --git a/.claude/skills/rationalize-deps/SKILL.md b/.claude/skills/rationalize-deps/SKILL.md new file mode 100644 index 00000000000..829a70c67ec --- /dev/null +++ b/.claude/skills/rationalize-deps/SKILL.md @@ -0,0 +1,125 @@ +--- +name: rationalize-deps +description: Analyze Cargo.toml dependencies and attempt to remove unused features to reduce compile times and binary size +--- + +# Rationalize Dependencies + +This skill analyzes Cargo.toml dependencies to identify and remove unused features. + +## Overview + +Many crates enable features by default that may not be needed. This skill: +1. Identifies dependencies with default features enabled +2. Tests if `default-features = false` works +3. Identifies which specific features are actually needed +4. Verifies compilation after changes + +## Step 1: Identify the target + +Ask the user which crate(s) to analyze: +- A specific crate name (e.g., "tokio", "serde") +- A specific workspace member (e.g., "quickwit-search") +- "all" to scan the entire workspace + +## Step 2: Analyze current dependencies + +For the workspace Cargo.toml (`quickwit/Cargo.toml`), list dependencies that: +- Do NOT have `default-features = false` +- Have default features that might be unnecessary + +Run: `cargo tree -p CRATE_NAME -f "{p} {f}" --edges features` (replacing `CRATE_NAME` with the workspace member being analyzed) to see what features are actually used. 
+ +## Step 3: For each candidate dependency + +### 3a: Check the crate's default features + +Look up the crate on crates.io or check its Cargo.toml to understand: +- What features are enabled by default +- What each feature provides + +Use: `cargo metadata --format-version=1 | jq '.packages[] | select(.name == "CRATE_NAME") | .features'` (replacing `CRATE_NAME` with the dependency's name) + +### 3b: Try disabling default features + +Modify the dependency in `quickwit/Cargo.toml`: + +From: +```toml +some-crate = { version = "1.0" } +``` + +To: +```toml +some-crate = { version = "1.0", default-features = false } +``` + +### 3c: Run cargo check + +Run: `cargo check --workspace` (or target specific packages for faster feedback) + +If compilation fails: +1. Read the error messages to identify which features are needed +2. Add only the required features explicitly: + ```toml + some-crate = { version = "1.0", default-features = false, features = ["needed-feature"] } + ``` +3. Re-run cargo check + +### 3d: Binary search for minimal features + +If there are many default features, use binary search: +1. Start with no features +2. If it fails, add half the default features +3. 
Continue until you find the minimal set + +## Step 4: Document findings + +For each dependency analyzed, report: +- Original configuration +- New configuration (if changed) +- Features that were removed +- Any features that are required + +## Step 5: Verify full build + +After all changes, run: +```bash +cargo check --workspace --all-targets +cargo test --workspace --no-run +``` + +## Common Patterns + +### Serde +Often only needs `derive`: +```toml +serde = { version = "1.0", default-features = false, features = ["derive", "std"] } +``` + +### Tokio +Identify which runtime features are actually used: +```toml +tokio = { version = "1.0", default-features = false, features = ["rt-multi-thread", "macros", "sync"] } +``` + +### Reqwest +Often doesn't need all TLS backends: +```toml +reqwest = { version = "0.11", default-features = false, features = ["rustls-tls", "json"] } +``` + +## Rollback + +If changes cause issues: +```bash +git checkout quickwit/Cargo.toml +cargo check --workspace +``` + +## Tips + +- Start with large crates that have many default features (tokio, reqwest, hyper) +- Use `cargo bloat --crates` to identify large dependencies +- Check `cargo tree -d` for duplicate dependencies that might indicate feature conflicts +- Some features are needed only for tests - consider using `[dev-dependencies]` features diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7af5fbda950..dead7aedeca 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -188,28 +188,3 @@ jobs: if: always() && steps.modified.outputs.rust_src == 'true' run: cargo +nightly fmt --all -- --check working-directory: ./quickwit - - thirdparty-license: - name: Check Datadog third-party license file - runs-on: ubuntu-latest - permissions: - contents: read - actions: write - steps: - - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 - - name: Install Rust toolchain - uses: dtolnay/rust-toolchain@f7ccc83f9ed1e5b9c81d8a67d7ad1a747e22a561 # 
master - with: - toolchain: stable - - - name: Cache cargo tools - uses: actions/cache@9255dc7a253b0ccc959486e2bca901246202afeb # v5.0.1 - with: - path: ~/.cargo/bin - key: ${{ runner.os }}-cargo-tools-${{ hashFiles('**/Cargo.lock') }} - - - name: Install dd-rust-license-tool - run: dd-rust-license-tool --help || cargo install --git https://github.com/DataDog/rust-license-tool.git --force - - - name: Check Datadog third-party license file - run: dd-rust-license-tool --config quickwit/license-tool.toml --manifest-path quickwit/Cargo.toml check diff --git a/LICENSE-3rdparty.csv b/LICENSE-3rdparty.csv index 13904cb90c2..ed79fbdb132 100644 --- a/LICENSE-3rdparty.csv +++ b/LICENSE-3rdparty.csv @@ -52,7 +52,6 @@ base16ct,https://github.com/RustCrypto/formats/tree/master/base16ct,Apache-2.0 O base64,https://github.com/marshallpierce/rust-base64,MIT OR Apache-2.0,Marshall Pierce base64-simd,https://github.com/Nugine/simd,MIT,The base64-simd Authors base64ct,https://github.com/RustCrypto/formats,Apache-2.0 OR MIT,RustCrypto Developers -bincode,https://github.com/servo/bincode,MIT,"Ty Overby , Francesco Mazzoli , David Tolnay , Zoey Riordan " bit-set,https://github.com/contain-rs/bit-set,Apache-2.0 OR MIT,Alexis Beingessner bit-vec,https://github.com/contain-rs/bit-vec,Apache-2.0 OR MIT,Alexis Beingessner bitflags,https://github.com/bitflags/bitflags,MIT OR Apache-2.0,The Rust Project Developers @@ -104,8 +103,6 @@ crossbeam-utils,https://github.com/crossbeam-rs/crossbeam,MIT OR Apache-2.0,The crunchy,https://github.com/eira-fransham/crunchy,MIT,Eira Fransham crypto-bigint,https://github.com/RustCrypto/crypto-bigint,Apache-2.0 OR MIT,RustCrypto Developers crypto-common,https://github.com/RustCrypto/traits,MIT OR Apache-2.0,RustCrypto Developers -csv,https://github.com/BurntSushi/rust-csv,Unlicense OR MIT,Andrew Gallant -csv-core,https://github.com/BurntSushi/rust-csv,Unlicense OR MIT,Andrew Gallant darling,https://github.com/TedDriggs/darling,MIT,Ted Driggs 
darling_core,https://github.com/TedDriggs/darling,MIT,Ted Driggs darling_macro,https://github.com/TedDriggs/darling,MIT,Ted Driggs @@ -130,15 +127,7 @@ elliptic-curve,https://github.com/RustCrypto/traits/tree/master/elliptic-curve,A embedded-io,https://github.com/embassy-rs/embedded-io,MIT OR Apache-2.0,The embedded-io Authors embedded-io,https://github.com/rust-embedded/embedded-hal,MIT OR Apache-2.0,The embedded-io Authors encode_unicode,https://github.com/tormol/encode_unicode,Apache-2.0 OR MIT,Torbjørn Birch Moltu -encoding,https://github.com/lifthrasiir/rust-encoding,MIT,Kang Seonghoon -encoding-index-japanese,https://github.com/lifthrasiir/rust-encoding,CC0-1.0,Kang Seonghoon -encoding-index-korean,https://github.com/lifthrasiir/rust-encoding,CC0-1.0,Kang Seonghoon -encoding-index-simpchinese,https://github.com/lifthrasiir/rust-encoding,CC0-1.0,Kang Seonghoon -encoding-index-singlebyte,https://github.com/lifthrasiir/rust-encoding,CC0-1.0,Kang Seonghoon -encoding-index-tradchinese,https://github.com/lifthrasiir/rust-encoding,CC0-1.0,Kang Seonghoon -encoding_index_tests,https://github.com/lifthrasiir/rust-encoding,CC0-1.0,Kang Seonghoon encoding_rs,https://github.com/hsivonen/encoding_rs,(Apache-2.0 OR MIT) AND BSD-3-Clause,Henri Sivonen -encoding_rs_io,https://github.com/BurntSushi/encoding_rs_io,MIT OR Apache-2.0,Andrew Gallant enum-iterator,https://github.com/stephaneyfx/enum-iterator,0BSD,Stephane Raux enum-iterator-derive,https://github.com/stephaneyfx/enum-iterator,0BSD,Stephane Raux env_filter,https://github.com/rust-cli/env_logger,MIT OR Apache-2.0,The env_filter Authors @@ -150,7 +139,6 @@ fail,https://github.com/tikv/fail-rs,Apache-2.0,The TiKV Project Developers fastdivide,https://github.com/fulmicoton/fastdivide,zlib-acknowledgement OR MIT,Paul Masurel fastrand,https://github.com/smol-rs/fastrand,Apache-2.0 OR MIT,Stjepan Glavina ff,https://github.com/zkcrypto/ff,MIT OR Apache-2.0,"Sean Bowe , Jack Grigg " 
-filetime,https://github.com/alexcrichton/filetime,MIT OR Apache-2.0,Alex Crichton find-msvc-tools,https://github.com/rust-lang/cc-rs,MIT OR Apache-2.0,The find-msvc-tools Authors fixedbitset,https://github.com/petgraph/fixedbitset,MIT OR Apache-2.0,bluss flate2,https://github.com/rust-lang/flate2-rs,MIT OR Apache-2.0,"Alex Crichton , Josh Triplett " @@ -224,8 +212,6 @@ is-terminal,https://github.com/sunfishcode/is-terminal,MIT,"softprops -jiff,https://github.com/BurntSushi/jiff,Unlicense OR MIT,Andrew Gallant -jiff-static,https://github.com/BurntSushi/jiff,Unlicense OR MIT,Andrew Gallant jobserver,https://github.com/rust-lang/jobserver-rs,MIT OR Apache-2.0,Alex Crichton js-sys,https://github.com/wasm-bindgen/wasm-bindgen/tree/master/crates/js-sys,MIT OR Apache-2.0,The wasm-bindgen Developers json_comments,https://github.com/tmccombs/json-comments-rs,Apache-2.0,Thayne McCombs @@ -233,19 +219,6 @@ lazy_static,https://github.com/rust-lang-nursery/lazy-static.rs,MIT OR Apache-2. levenshtein_automata,https://github.com/tantivy-search/levenshtein-automata,MIT,Paul Masurel libc,https://github.com/rust-lang/libc,MIT OR Apache-2.0,The Rust Project Developers libm,https://github.com/rust-lang/compiler-builtins,MIT,Jorge Aparicio -libredox,https://gitlab.redox-os.org/redox-os/libredox,MIT,4lDO2 <4lDO2@protonmail.com> -lindera-cc-cedict,https://github.com/lindera-morphology/lindera,MIT,The lindera-cc-cedict Authors -lindera-cc-cedict-builder,https://github.com/lindera-morphology/lindera,MIT,The lindera-cc-cedict-builder Authors -lindera-core,https://github.com/lindera-morphology/lindera,MIT,The lindera-core Authors -lindera-decompress,https://github.com/lindera-morphology/lindera,MIT,The lindera-decompress Authors -lindera-dictionary,https://github.com/lindera-morphology/lindera,MIT,The lindera-dictionary Authors -lindera-ipadic,https://github.com/lindera-morphology/lindera,MIT,The lindera-ipadic Authors 
-lindera-ipadic-builder,https://github.com/lindera-morphology/lindera,MIT,The lindera-ipadic-builder Authors -lindera-ipadic-neologd-builder,https://github.com/lindera-morphology/lindera,MIT,The lindera-ipadic-neologd-builder Authors -lindera-ko-dic,https://github.com/lindera-morphology/lindera,MIT,The lindera-ko-dic Authors -lindera-ko-dic-builder,https://github.com/lindera-morphology/lindera,MIT,The lindera-ko-dic-builder Authors -lindera-tokenizer,https://github.com/lindera-morphology/lindera,MIT,The lindera-tokenizer Authors -lindera-unidic-builder,https://github.com/lindera-morphology/lindera,MIT,The lindera-unidic-builder Authors linked-hash-map,https://github.com/contain-rs/linked-hash-map,MIT OR Apache-2.0,"Stepan Koltsov , Andrew Paseltiner " linux-raw-sys,https://github.com/sunfishcode/linux-raw-sys,Apache-2.0 WITH LLVM-exception OR Apache-2.0 OR MIT,Dan Gohman litemap,https://github.com/unicode-org/icu4x,Unicode-3.0,The ICU4X Project Developers @@ -330,7 +303,6 @@ pnet_packet,https://github.com/libpnet/libpnet,MIT OR Apache-2.0,Robert Clipsham pnet_sys,https://github.com/libpnet/libpnet,MIT OR Apache-2.0,"Robert Clipsham , Linus Färnstrand " pnet_transport,https://github.com/libpnet/libpnet,MIT OR Apache-2.0,Robert Clipsham portable-atomic,https://github.com/taiki-e/portable-atomic,Apache-2.0 OR MIT,The portable-atomic Authors -portable-atomic-util,https://github.com/taiki-e/portable-atomic,Apache-2.0 OR MIT,The portable-atomic-util Authors postcard,https://github.com/jamesmunns/postcard,MIT OR Apache-2.0,James Munns potential_utf,https://github.com/unicode-org/icu4x,Unicode-3.0,The ICU4X Project Developers powerfmt,https://github.com/jhpratt/powerfmt,MIT OR Apache-2.0,Jacob Pratt @@ -353,8 +325,6 @@ prost,https://github.com/tokio-rs/prost,Apache-2.0,"Dan Burkert , Lucio Franco , Casper Meijn , Tokio Contributors " prost-derive,https://github.com/tokio-rs/prost,Apache-2.0,"Dan Burkert , Lucio Franco , Casper Meijn , Tokio Contributors " 
prost-types,https://github.com/tokio-rs/prost,Apache-2.0,"Dan Burkert , Lucio Franco , Casper Meijn , Tokio Contributors " -protobuf,https://github.com/stepancheg/rust-protobuf,MIT,Stepan Koltsov -protobuf-support,https://github.com/stepancheg/rust-protobuf,MIT,Stepan Koltsov pulldown-cmark,https://github.com/raphlinus/pulldown-cmark,MIT,"Raph Levien , Marcus Klaas de Vries " pulldown-cmark-to-cmark,https://github.com/Byron/pulldown-cmark-to-cmark,Apache-2.0,"Sebastian Thiel , Dylan Owen , Alessandro Ogier , Zixian Cai <2891235+caizixian@users.noreply.github.com>, Andrew Lyjak " quanta,https://github.com/metrics-rs/quanta,MIT,Toby Lawrence @@ -388,7 +358,6 @@ roxmltree,https://github.com/RazrFalcon/roxmltree,MIT OR Apache-2.0,Evgeniy Reiz rust-embed,https://pyrossh.dev/repos/rust-embed,MIT,pyrossh rust-embed-impl,https://pyrossh.dev/repos/rust-embed,MIT,pyrossh rust-embed-utils,https://pyrossh.dev/repos/rust-embed,MIT,pyrossh -rust-stemmers,https://github.com/CurrySoftware/rust-stemmers,MIT OR BSD-3-Clause,"Jakob Demler , CurrySoftware " rustc-hash,https://github.com/rust-lang/rustc-hash,Apache-2.0 OR MIT,The Rust Project Developers rustix,https://github.com/bytecodealliance/rustix,Apache-2.0 WITH LLVM-exception OR Apache-2.0 OR MIT,"Dan Gohman , Jakub Konka " rustls,https://github.com/rustls/rustls,Apache-2.0 OR ISC OR MIT,The rustls Authors @@ -448,8 +417,6 @@ syn,https://github.com/dtolnay/syn,MIT OR Apache-2.0,David Tolnay synstructure,https://github.com/mystor/synstructure,MIT,Nika Layzell sysinfo,https://github.com/GuillaumeGomez/sysinfo,MIT,Guillaume Gomez -system-configuration,https://github.com/mullvad/system-configuration-rs,MIT OR Apache-2.0,Mullvad VPN -system-configuration-sys,https://github.com/mullvad/system-configuration-rs,MIT OR Apache-2.0,Mullvad VPN tabled,https://github.com/zhiburt/tabled,MIT,Maxim Zhiburt tabled_derive,https://github.com/zhiburt/tabled,MIT,Maxim Zhiburt tantivy,https://github.com/quickwit-oss/tantivy,MIT,Paul Masurel @@ -545,7 
+512,6 @@ wasmtimer,https://github.com/whizsid/wasmtimer-rs,MIT,"WhizSid web-sys,https://github.com/wasm-bindgen/wasm-bindgen/tree/master/crates/web-sys,MIT OR Apache-2.0,The wasm-bindgen Developers web-time,https://github.com/daxpedda/web-time,MIT OR Apache-2.0,The web-time Authors webpki-roots,https://github.com/rustls/webpki-roots,CDLA-Permissive-2.0,The webpki-roots Authors -whichlang,https://github.com/quickwit-oss/whichlang,MIT,"Quickwit, Inc. " winapi,https://github.com/retep998/winapi-rs,MIT,Peter Atashian winapi,https://github.com/retep998/winapi-rs,MIT OR Apache-2.0,Peter Atashian winapi-i686-pc-windows-gnu,https://github.com/retep998/winapi-rs,MIT OR Apache-2.0,Peter Atashian @@ -561,7 +527,6 @@ windows-interface,https://github.com/microsoft/windows-rs,MIT OR Apache-2.0,The windows-link,https://github.com/microsoft/windows-rs,MIT OR Apache-2.0,Microsoft windows-link,https://github.com/microsoft/windows-rs,MIT OR Apache-2.0,The windows-link Authors windows-numerics,https://github.com/microsoft/windows-rs,MIT OR Apache-2.0,The windows-numerics Authors -windows-registry,https://github.com/microsoft/windows-rs,MIT OR Apache-2.0,The windows-registry Authors windows-result,https://github.com/microsoft/windows-rs,MIT OR Apache-2.0,Microsoft windows-result,https://github.com/microsoft/windows-rs,MIT OR Apache-2.0,The windows-result Authors windows-strings,https://github.com/microsoft/windows-rs,MIT OR Apache-2.0,Microsoft @@ -590,9 +555,7 @@ windows_x86_64_msvc,https://github.com/microsoft/windows-rs,MIT OR Apache-2.0,Th winnow,https://github.com/winnow-rs/winnow,MIT,The winnow Authors wit-bindgen,https://github.com/bytecodealliance/wit-bindgen,Apache-2.0 WITH LLVM-exception OR Apache-2.0 OR MIT,Alex Crichton writeable,https://github.com/unicode-org/icu4x,Unicode-3.0,The ICU4X Project Developers -xattr,https://github.com/Stebalien/xattr,MIT OR Apache-2.0,Steven Allen xmlparser,https://github.com/RazrFalcon/xmlparser,MIT OR Apache-2.0,Yevhenii Reizner 
-yada,https://github.com/takuyaa/yada,MIT OR Apache-2.0,Takuya Asano yansi,https://github.com/SergioBenitez/yansi,MIT OR Apache-2.0,Sergio Benitez yoke,https://github.com/unicode-org/icu4x,Unicode-3.0,Manish Goregaokar yoke-derive,https://github.com/unicode-org/icu4x,Unicode-3.0,Manish Goregaokar diff --git a/docs/configuration/index-config.md b/docs/configuration/index-config.md index 24ce8677902..c8f26ded709 100644 --- a/docs/configuration/index-config.md +++ b/docs/configuration/index-config.md @@ -94,6 +94,7 @@ The doc mapping defines how a document and the fields it contains are stored and | `tag_fields` | Collection of fields* explicitly defined in `field_mappings` whose values will be stored as part of the `tags` metadata. Allowed types are: `text` (with raw tokenizer), `i64` and `u64`. [Learn more about tags](../overview/concepts/querying.md#tag-pruning). | `[]` | | `store_source` | Whether or not the original JSON document is stored or not in the index. | `false` | | `timestamp_field` | Timestamp field* used for sharding documents in splits. The field has to be of type `datetime`. [Learn more about time sharding](./../overview/architecture.md). | `None` | +| `indexation_time_field` | Field that will hold the indexing time of the document. This field is populated during indexing. The field has to be of type `datetime`. | `None` | | `partition_key` | If set, quickwit will route documents into different splits depending on the field name declared as the `partition_key`. | `null` | | `max_num_partitions` | Limits the number of splits created through partitioning. (See [Partitioning](../overview/concepts/querying.md#partitioning)) | `200` | | `index_field_presence` | `exists` queries are enabled automatically for fast fields. To enable it for all other fields set this parameter to `true`. Enabling it can have a significant CPU-cost on indexing. 
| false | diff --git a/docs/reference/es_compatible_api.md b/docs/reference/es_compatible_api.md index 32cbdafd761..885ac39e67b 100644 --- a/docs/reference/es_compatible_api.md +++ b/docs/reference/es_compatible_api.md @@ -187,11 +187,12 @@ It is also possible to not supply an order and rely on the default order using t } ``` -If no format is provided for timestamps, timestamps are returned with milliseconds precision. - -If you need nanosecond precision, you can use the `epoch_nanos_int` format. Beware this means the resulting -JSON may contain high numbers for which there is loss of precision when using languages where all numbers are -floats, such as JavaScript. +Fields explicitly specified as `datetime` in the doc mapping also support an +output format. If no format is provided, timestamps are returned with +milliseconds precision. If you need nanosecond precision, you can use the +`epoch_nanos_int` format. Beware, this means the resulting JSON may contain high +numbers for which there is loss of precision when using languages where all +numbers are floats, such as JavaScript. ```json { @@ -237,6 +238,40 @@ You can pass the `sort` value of the last hit in a subsequent request where othe This allows you to paginate your results. + +#### Note regarding multi-type pagination + +Pagination can get tricky on fields that have multiple types. In dynamic fields, multiple column types can be present for a given field within a single split. When using doc mapping updates, any type combination can be present across splits. + +First, let's take a look at the various type systems we are working with. 
+ +The JSON representation used for the sort values provides the following primitive types: +- numerical +- bool +- string + +Tantivy uses the following types: +- i64 / u64 / f64 (only one of these can be present in a split) +- datetime +- string +- bool +- ip (not supported in sort yet) +- bytes (not supported in sort yet) + +Elasticsearch can represent date field sort values in various formats. In Quickwit, only integer formats are supported (millisecond or nanosecond). Either way, the fact that datetime can live alongside another type inside a split yields unreliable pagination: +- Because there isn't a simple and efficient common representation in the fast field u64 space, it's hard to represent datetime within the numerical (i64/u64/f64) order. +- To paginate separately across numerical and datetime types, a strongly typed representation of the JSON sort key would be necessary. + +The current implementation does the following: +- If the mapping is explicitly set to datetime and never changed, pagination works as expected. +- If the mapping evolved to datetime, pagination fails for splits that contain numerical values (i64, u64, f64 columns). +- If the mapping is a json/dynamic field, pagination fails for splits that contain a datetime column. This can happen because, on JSON fields, Tantivy automatically stores RFC3339 date strings in a datetime column. +- If other types are mixed, the sort will iterate over all values type by type + - Asc: numeric -> string -> boolean -> datetime -> null + - Desc: datetime -> boolean -> string -> numeric -> null +- Quickwit used to support specifying numbers as strings in the `search_after` value. That isn't possible anymore. 
+ + ### `_msearch`   Multi search API ``` diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock index a0f47b86c7d..56cd57bca98 100644 --- a/quickwit/Cargo.lock +++ b/quickwit/Cargo.lock @@ -1186,15 +1186,6 @@ version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7d809780667f4410e7c41b07f52439b94d2bdf8528eeedc287fa38d3b7f95d82" -[[package]] -name = "bincode" -version = "1.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" -dependencies = [ - "serde", -] - [[package]] name = "bindgen" version = "0.72.1" @@ -1376,9 +1367,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.11.0" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b35204fbdc0b3f4446b89fc1ac2cf84a8a68971995d0bf2e925ec7cd960f9cb3" +checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" dependencies = [ "serde", ] @@ -1529,9 +1520,8 @@ dependencies = [ [[package]] name = "chitchat" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "735f8a51f68b353b17e351b38317433d6afcaa9cc04f4d0f6c9e9125c49c1efe" +version = "0.9.0" +source = "git+https://github.com/quickwit-oss/chitchat.git?rev=bd54c81#bd54c810700814f83599a31a7e29f2a5eb8324b3" dependencies = [ "anyhow", "async-trait", @@ -2226,6 +2216,12 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be1e0bca6c3637f992fc1cc7cbc52a78c1ef6db076dbf1059c4323d6a2048376" +[[package]] +name = "datasketches" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c286de4e81ea2590afc24d754e0f83810c566f50a1388fa75ebd57928c0d9745" + [[package]] name = "dbl" version = "0.3.2" @@ -2312,8 +2308,6 @@ checksum = 
"25f104b501bf2364e78d0d3974cbc774f738f5865306ed128e1e0d7499c0ad96" dependencies = [ "console", "shell-words", - "tempfile", - "zeroize", ] [[package]] @@ -2598,70 +2592,6 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" -[[package]] -name = "encoding" -version = "0.2.33" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec" -dependencies = [ - "encoding-index-japanese", - "encoding-index-korean", - "encoding-index-simpchinese", - "encoding-index-singlebyte", - "encoding-index-tradchinese", -] - -[[package]] -name = "encoding-index-japanese" -version = "1.20141219.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91" -dependencies = [ - "encoding_index_tests", -] - -[[package]] -name = "encoding-index-korean" -version = "1.20141219.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81" -dependencies = [ - "encoding_index_tests", -] - -[[package]] -name = "encoding-index-simpchinese" -version = "1.20141219.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7" -dependencies = [ - "encoding_index_tests", -] - -[[package]] -name = "encoding-index-singlebyte" -version = "1.20141219.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a" -dependencies = [ - "encoding_index_tests", -] - -[[package]] -name = "encoding-index-tradchinese" -version = "1.20141219.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18" 
-dependencies = [ - "encoding_index_tests", -] - -[[package]] -name = "encoding_index_tests" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569" - [[package]] name = "encoding_rs" version = "0.8.35" @@ -2671,15 +2601,6 @@ dependencies = [ "cfg-if", ] -[[package]] -name = "encoding_rs_io" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cc3c5651fb62ab8aa3103998dade57efdd028544bd300516baa31840c252a83" -dependencies = [ - "encoding_rs", -] - [[package]] name = "enum-iterator" version = "2.3.0" @@ -2700,29 +2621,6 @@ dependencies = [ "syn 2.0.114", ] -[[package]] -name = "env_filter" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bf3c259d255ca70051b30e2e95b5446cdb8949ac4cd22c0d7fd634d89f568e2" -dependencies = [ - "log", - "regex", -] - -[[package]] -name = "env_logger" -version = "0.11.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13c863f0904021b108aa8b2f55046443e6b1ebde8fd4a15c399893aae4fa069f" -dependencies = [ - "anstream", - "anstyle", - "env_filter", - "jiff", - "log", -] - [[package]] name = "equator" version = "0.4.2" @@ -2892,18 +2790,6 @@ version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "28dea519a9695b9977216879a3ebfddf92f1c08c05d984f8996aecd6ecdc811d" -[[package]] -name = "filetime" -version = "0.2.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc0505cd1b6fa6580283f6bdf70a73fcf4aba1184038c90902b92b3dd0df63ed" -dependencies = [ - "cfg-if", - "libc", - "libredox", - "windows-sys 0.60.2", -] - [[package]] name = "find-msvc-tools" version = "0.1.6" @@ -3824,21 +3710,9 @@ dependencies = [ "percent-encoding", "pin-project-lite", "socket2 0.6.1", - "system-configuration", "tokio", - "tower-layer", "tower-service", "tracing", - 
"windows-registry", -] - -[[package]] -name = "hyperloglogplus" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "621debdf94dcac33e50475fdd76d34d5ea9c0362a834b9db08c3024696c1fbe3" -dependencies = [ - "serde", ] [[package]] @@ -4292,9 +4166,9 @@ dependencies = [ [[package]] name = "keccak" -version = "0.1.5" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ecc2af9a1119c51f12a14607e783cb977bde58bc069ff0c3da1095e635d70654" +checksum = "cb26cec98cce3a3d96cbb7bced3c4b16e3d13f27ec56dbd62cbc8f39cfb9d653" dependencies = [ "cpufeatures", ] @@ -4418,219 +4292,6 @@ dependencies = [ "vcpkg", ] -[[package]] -name = "lindera-cc-cedict" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7595a377b9723e837711366721b02662dac64d734af3dac1c01941e779e95a6b" -dependencies = [ - "bincode", - "byteorder", - "encoding", - "flate2", - "lindera-cc-cedict-builder", - "lindera-core", - "once_cell", - "tar", - "ureq", -] - -[[package]] -name = "lindera-cc-cedict-builder" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c6fbd76a65b5df73574898e871d7cff3e34bf89f544f6e1a1087cba82e25cce" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "csv", - "encoding", - "env_logger", - "glob", - "lindera-core", - "lindera-decompress", - "log", - "yada", -] - -[[package]] -name = "lindera-core" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85af015d15c25cb3b7af82ba181908f4afbec6a2636f0fdfcca6d173c1b2c7fe" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "encoding_rs", - "log", - "once_cell", - "serde", - "thiserror 1.0.69", - "yada", -] - -[[package]] -name = "lindera-decompress" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3dfc054b2f3f3eb21a24ce062a3d5f969339ddf50652038ea33993b1b97d4ba" 
-dependencies = [ - "anyhow", - "flate2", - "serde", -] - -[[package]] -name = "lindera-dictionary" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6b1a5d8f4cba37dcca18dc0e827233ff46695a6d878d716f16f755d264d588a" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "lindera-cc-cedict", - "lindera-cc-cedict-builder", - "lindera-core", - "lindera-ipadic", - "lindera-ipadic-builder", - "lindera-ipadic-neologd-builder", - "lindera-ko-dic", - "lindera-ko-dic-builder", - "lindera-unidic-builder", - "serde", -] - -[[package]] -name = "lindera-ipadic" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e5f1d26aba22d8a9193dcd2d087205d89e0ffb19490bc305b341e25c037f353" -dependencies = [ - "bincode", - "byteorder", - "encoding", - "flate2", - "lindera-core", - "lindera-ipadic-builder", - "once_cell", - "tar", - "ureq", -] - -[[package]] -name = "lindera-ipadic-builder" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "184a9769b05ae857bd55f5e8a94b2ae2ba8816c5c6b78c73f161b4d7490c0461" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "csv", - "encoding_rs", - "encoding_rs_io", - "env_logger", - "glob", - "lindera-core", - "lindera-decompress", - "log", - "serde", - "yada", -] - -[[package]] -name = "lindera-ipadic-neologd-builder" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b8cd28b5402425184d0f719d5bd81af87a7e36e2032b5bcceddf55011b1b22c" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "csv", - "encoding_rs", - "encoding_rs_io", - "env_logger", - "glob", - "lindera-core", - "lindera-decompress", - "log", - "serde", - "yada", -] - -[[package]] -name = "lindera-ko-dic" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a6d718720a28ac5d93b449661d8844f7858b2b71595e3198bc90e437f01e5ce" -dependencies = [ - 
"bincode", - "byteorder", - "encoding", - "flate2", - "lindera-core", - "lindera-ko-dic-builder", - "once_cell", - "tar", - "ureq", -] - -[[package]] -name = "lindera-ko-dic-builder" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f22de1fcdc33de258037145ae86686125214206b98d04c6dfe01f36c136c0022" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "csv", - "encoding", - "env_logger", - "glob", - "lindera-core", - "lindera-decompress", - "log", - "yada", -] - -[[package]] -name = "lindera-tokenizer" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cca45cbc1af512ce2aa9dea9a1d694430480a53bb53e37165ba143e27e81f7dd" -dependencies = [ - "bincode", - "lindera-core", - "lindera-dictionary", - "once_cell", - "serde", - "serde_json", -] - -[[package]] -name = "lindera-unidic-builder" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "359425c8dff54164ff1b068122d26df358ce18533e4771eb5c5ce68888d988f2" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "csv", - "encoding", - "env_logger", - "glob", - "lindera-core", - "lindera-decompress", - "log", - "yada", -] - [[package]] name = "linked-hash-map" version = "0.5.6" @@ -4731,6 +4392,12 @@ dependencies = [ "twox-hash", ] +[[package]] +name = "lz4_flex" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db9a0d582c2874f68138a16ce1867e0ffde6c0bb0a0df85e1f36d04146db488a" + [[package]] name = "matchers" version = "0.2.0" @@ -5090,9 +4757,9 @@ dependencies = [ [[package]] name = "num-conv" -version = "0.1.0" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" +checksum = "cf97ec579c3c42f953ef76dbf8d55ac91fb219dde70e49aa4a6b7d74e9919050" [[package]] name = "num-format" @@ -5297,9 +4964,9 @@ checksum = 
"384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" [[package]] name = "oneshot" -version = "0.1.11" +version = "0.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4ce411919553d3f9fa53a0880544cda985a112117a0444d5ff1e870a893d6ea" +checksum = "269bca4c2591a28585d6bf10d9ed0332b7d76900a1b02bec41bdc3a2cdcda107" [[package]] name = "onig" @@ -5598,7 +5265,8 @@ checksum = "1a80800c0488c3a21695ea981a54918fbb37abf04f4d0720c453632255e2ff0e" [[package]] name = "ownedbytes" version = "0.9.0" -source = "git+https://github.com/SekoiaLab/tantivy/?rev=e9aede4#e9aede4b7eea5ce578887e8c7a1194b20953be0a" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fbd56f7631767e61784dc43f8580f403f4475bd4aaa4da003e6295e1bab4a7e" dependencies = [ "stable_deref_trait", ] @@ -6364,7 +6032,6 @@ dependencies = [ "memchr", "parking_lot 0.12.5", "procfs", - "protobuf", "thiserror 2.0.17", ] @@ -6504,26 +6171,6 @@ dependencies = [ "prost 0.14.1", ] -[[package]] -name = "protobuf" -version = "3.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d65a1d4ddae7d8b5de68153b48f6aa3bba8cb002b243dbdbc55a5afbc98f99f4" -dependencies = [ - "once_cell", - "protobuf-support", - "thiserror 1.0.69", -] - -[[package]] -name = "protobuf-support" -version = "3.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e36c2f31e0a47f9280fb347ef5e461ffcd2c52dd520d8e216b52f93b0b0d7d6" -dependencies = [ - "thiserror 1.0.69", -] - [[package]] name = "psl" version = "2.1.176" @@ -6719,7 +6366,7 @@ dependencies = [ "quickwit-cluster", "quickwit-common", "quickwit-config", - "quickwit-doc-mapper", + "quickwit-directories", "quickwit-index-management", "quickwit-indexing", "quickwit-ingest", @@ -6734,6 +6381,7 @@ dependencies = [ "rustls 0.23.36", "serde_json", "tabled", + "tantivy", "tempfile", "thiserror 2.0.17", "thousands", @@ -6825,7 +6473,6 @@ dependencies = [ "bytesize", "coarsetime", 
"dyn-clone", - "env_logger", "fnv", "futures", "home", @@ -6856,6 +6503,7 @@ dependencies = [ "tonic 0.14.2", "tower 0.5.2", "tracing", + "tracing-subscriber", ] [[package]] @@ -6897,6 +6545,7 @@ version = "0.8.0" dependencies = [ "anyhow", "async-trait", + "base64 0.22.1", "bytesize", "fnv", "futures", @@ -6905,6 +6554,7 @@ dependencies = [ "mockall", "once_cell", "proptest", + "prost 0.14.1", "quickwit-actors", "quickwit-cluster", "quickwit-common", @@ -6917,6 +6567,7 @@ dependencies = [ "serde", "serde_json", "smallvec", + "time", "tokio", "tracing", "ulid", @@ -6977,6 +6628,7 @@ dependencies = [ "serde_yaml", "siphasher", "tantivy", + "tantivy-fst", "thiserror 2.0.17", "time", "tracing", @@ -7328,9 +6980,6 @@ dependencies = [ "bitpacking", "criterion", "hex", - "lindera-core", - "lindera-dictionary", - "lindera-tokenizer", "once_cell", "proptest", "quickwit-common", @@ -7346,7 +6995,6 @@ dependencies = [ "thiserror 2.0.17", "time", "tracing", - "whichlang", ] [[package]] @@ -7445,6 +7093,7 @@ dependencies = [ "mockall", "once_cell", "percent-encoding", + "pin-project", "pprof", "prost 0.14.1", "prost-types 0.14.1", @@ -8195,16 +7844,6 @@ dependencies = [ "walkdir", ] -[[package]] -name = "rust-stemmers" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e46a2036019fdb888131db7a4c847a1063a7493f971ed94ea82c67eada63ca54" -dependencies = [ - "serde", - "serde_derive", -] - [[package]] name = "rust_decimal" version = "1.39.0" @@ -8956,9 +8595,9 @@ checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" [[package]] name = "sketches-ddsketch" -version = "0.3.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1e9a774a6c28142ac54bb25d25562e6bcf957493a184f15ad4eebccb23e410a" +checksum = "05e40b6cf54d988dc1a2223531b969c9a9e30906ad90ef64890c27b4bfbb46ea" dependencies = [ "serde", ] @@ -9428,27 +9067,6 @@ dependencies = [ "nom 8.0.0", ] -[[package]] -name = 
"system-configuration" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" -dependencies = [ - "bitflags 2.10.0", - "core-foundation 0.9.4", - "system-configuration-sys", -] - -[[package]] -name = "system-configuration-sys" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4" -dependencies = [ - "core-foundation-sys", - "libc", -] - [[package]] name = "tabled" version = "0.20.0" @@ -9483,8 +9101,9 @@ checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417" [[package]] name = "tantivy" -version = "0.26.0" -source = "git+https://github.com/SekoiaLab/tantivy/?rev=e9aede4#e9aede4b7eea5ce578887e8c7a1194b20953be0a" +version = "0.26.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edde6a10743fff00a4e1a8c9ef020bf5f3cbad301b7d2d39f2b07f123c4eac07" dependencies = [ "aho-corasick", "arc-swap", @@ -9495,6 +9114,7 @@ dependencies = [ "census", "crc32fast", "crossbeam-channel", + "datasketches", "downcast-rs", "fastdivide", "fnv", @@ -9502,19 +9122,17 @@ dependencies = [ "futures-channel", "futures-util", "htmlescape", - "hyperloglogplus", "itertools 0.14.0", "levenshtein_automata", "log", - "lru 0.12.5", - "lz4_flex", + "lru 0.16.3", + "lz4_flex 0.13.0", "measure_time", "memmap2", "once_cell", "oneshot", "rayon", "regex", - "rust-stemmers", "rustc-hash", "serde", "serde_json", @@ -9539,16 +9157,18 @@ dependencies = [ [[package]] name = "tantivy-bitpacker" -version = "0.9.0" -source = "git+https://github.com/SekoiaLab/tantivy/?rev=e9aede4#e9aede4b7eea5ce578887e8c7a1194b20953be0a" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fed3d674429bcd2de5d0a6d1aa5495fed8afd9c5ecce993019caf7615f53fa4" dependencies = [ "bitpacking", ] [[package]] name = 
"tantivy-columnar" -version = "0.6.0" -source = "git+https://github.com/SekoiaLab/tantivy/?rev=e9aede4#e9aede4b7eea5ce578887e8c7a1194b20953be0a" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c57166f5bcfd478f370ab8445afb4678dce44801fa5ce5c451aaf8595583c5dc" dependencies = [ "downcast-rs", "fastdivide", @@ -9562,8 +9182,9 @@ dependencies = [ [[package]] name = "tantivy-common" -version = "0.10.0" -source = "git+https://github.com/SekoiaLab/tantivy/?rev=e9aede4#e9aede4b7eea5ce578887e8c7a1194b20953be0a" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbf10915aa75da3c3b0d58b58853d2e889efbaf32d4982a4c3715dde6bba23e5" dependencies = [ "async-trait", "byteorder", @@ -9584,8 +9205,9 @@ dependencies = [ [[package]] name = "tantivy-query-grammar" -version = "0.25.0" -source = "git+https://github.com/SekoiaLab/tantivy/?rev=e9aede4#e9aede4b7eea5ce578887e8c7a1194b20953be0a" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfadb8526b6da90704feb293b0701a6aae62ea14983143344be2dc5ce30f1d82" dependencies = [ "fnv", "nom 7.1.3", @@ -9596,8 +9218,9 @@ dependencies = [ [[package]] name = "tantivy-sstable" -version = "0.6.0" -source = "git+https://github.com/SekoiaLab/tantivy/?rev=e9aede4#e9aede4b7eea5ce578887e8c7a1194b20953be0a" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a2cfc3ac5164cbadc28965ffb145a8f47582a60ae5897859ad8d4316596c606" dependencies = [ "futures-util", "itertools 0.14.0", @@ -9609,8 +9232,9 @@ dependencies = [ [[package]] name = "tantivy-stacker" -version = "0.6.0" -source = "git+https://github.com/SekoiaLab/tantivy/?rev=e9aede4#e9aede4b7eea5ce578887e8c7a1194b20953be0a" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6cbb051742da9d53ca9e8fff43a9b10e319338b24e2c0e15d0372df19ffeb951" dependencies = [ "murmurhash32", 
"tantivy-common", @@ -9618,21 +9242,11 @@ dependencies = [ [[package]] name = "tantivy-tokenizer-api" -version = "0.6.0" -source = "git+https://github.com/SekoiaLab/tantivy/?rev=e9aede4#e9aede4b7eea5ce578887e8c7a1194b20953be0a" -dependencies = [ - "serde", -] - -[[package]] -name = "tar" -version = "0.4.44" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d863878d212c87a19c1a610eb53bb01fe12951c0501cf5a0d65f724914a667a" +checksum = "eac258c2c6390673f2685813afeeafcb8c4e0ee7de8dd3fc46838dcc37263f98" dependencies = [ - "filetime", - "libc", - "xattr", + "serde", ] [[package]] @@ -9770,9 +9384,9 @@ dependencies = [ [[package]] name = "time" -version = "0.3.44" +version = "0.3.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91e7d9e3bb61134e77bde20dd4825b97c010155709965fedf0f49bb138e52a9d" +checksum = "743bd48c283afc0388f9b8827b976905fb217ad9e647fae3a379a9283c4def2c" dependencies = [ "deranged", "itoa", @@ -9781,16 +9395,16 @@ dependencies = [ "num-conv", "num_threads", "powerfmt", - "serde", + "serde_core", "time-core", "time-macros", ] [[package]] name = "time-core" -version = "0.1.6" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40868e7c1d2f0b8d73e4a8c7f0ff63af4f6d19be117e90bd73eb1d62cf831c6b" +checksum = "7694e1cfe791f8d31026952abf09c69ca6f6fa4e1a1229e18988f06a04a12dca" [[package]] name = "time-fmt" @@ -9804,9 +9418,9 @@ dependencies = [ [[package]] name = "time-macros" -version = "0.2.24" +version = "0.2.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30cfb0125f12d9c277f35663a0a33f8c30190f4e4574868a330595412d34ebf3" +checksum = "2e70e4c5a0e0a8a4823ad65dfe1a6930e4f4d756dcd9dd7939022b5e8c501215" dependencies = [ "num-conv", "time-core", @@ -9950,10 +9564,7 @@ dependencies = [ "futures-core", "futures-io", "futures-sink", - "futures-util", - "hashbrown 0.15.5", "pin-project-lite", - "slab", "tokio", ] @@ 
-10496,21 +10107,6 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" -[[package]] -name = "ureq" -version = "2.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02d1a66277ed75f640d608235660df48c8e3c19f3b4edb6a263315626cc3c01d" -dependencies = [ - "base64 0.22.1", - "log", - "once_cell", - "rustls 0.23.36", - "rustls-pki-types", - "url", - "webpki-roots 0.26.11", -] - [[package]] name = "url" version = "2.5.8" @@ -10679,7 +10275,7 @@ dependencies = [ "jsonschema", "lalrpop", "lalrpop-util", - "lz4_flex", + "lz4_flex 0.11.5", "md-5", "nom 8.0.0", "nom-language", @@ -10984,12 +10580,6 @@ dependencies = [ "rustls-pki-types", ] -[[package]] -name = "whichlang" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b9aa3ad29c3d08283ac6b769e3ec15ad1ddb88af7d2e9bc402c574973b937e7" - [[package]] name = "whoami" version = "1.6.1" @@ -11146,17 +10736,6 @@ dependencies = [ "windows-link 0.1.3", ] -[[package]] -name = "windows-registry" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02752bf7fbdcce7f2a27a742f798510f3e5ad88dbe84871e5168e2120c3d5720" -dependencies = [ - "windows-link 0.2.1", - "windows-result 0.4.1", - "windows-strings 0.5.1", -] - [[package]] name = "windows-result" version = "0.3.4" @@ -11496,16 +11075,6 @@ version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" -[[package]] -name = "xattr" -version = "1.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32e45ad4206f6d2479085147f02bc2ef834ac85886624a23575ae137c8aa8156" -dependencies = [ - "libc", - "rustix 1.1.3", -] - [[package]] name = "xmlparser" version = "0.13.6" @@ -11518,12 +11087,6 @@ version = "0.8.15" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3" -[[package]] -name = "yada" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aed111bd9e48a802518765906cbdadf0b45afb72b9c81ab049a3b86252adffdd" - [[package]] name = "yansi" version = "1.0.1" diff --git a/quickwit/Cargo.toml b/quickwit/Cargo.toml index 453b5850761..dbf75513736 100644 --- a/quickwit/Cargo.toml +++ b/quickwit/Cargo.toml @@ -91,7 +91,7 @@ bitpacking = "0.9.3" bytes = { version = "1", features = ["serde"] } bytesize = { version = "1.3", features = ["serde"] } bytestring = "1.5" -chitchat = "0.10.0" +chitchat = { git = "https://github.com/quickwit-oss/chitchat.git", rev = "bd54c81" } chrono = { version = "0.4", default-features = false, features = [ "clock", "std", @@ -102,11 +102,11 @@ colored = "3.0" console-subscriber = "0.5" criterion = { version = "0.8", features = ["async_tokio"] } cron = "0.15" -dialoguer = "0.12" +dialoguer = { version = "0.12", default-features = false } dotenvy = "0.15" dyn-clone = "1.0" enum-iterator = "2.3" -env_logger = "0.11" +env_logger = { version = "0.11", default-features = false, features = ["auto-color"] } fail = "0.5" flate2 = "1.1" flume = "0.12" @@ -131,23 +131,18 @@ http-serde = "2.1" humantime = "2.3" hyper = { version = "1.8", features = ["client", "http1", "http2", "server"] } hyper-rustls = "0.27" -hyper-util = { version = "0.1", features = ["full"] } +hyper-util = { version = "0.1", default-features = false, features = [ + "client-legacy", + "server-auto", + "server-graceful", + "service", + "tokio", +] } indexmap = { version = "2.12", features = ["serde"] } indicatif = "0.18" itertools = "0.14" json_comments = "0.2" libz-sys = "1.1" -# Lindera tokenizer 0.30+ versions (tested up to 0.32.3) are currently broken due to upstream build failures. 
-# The dictionary crates attempt to download artifacts from S3 URLs that return 404 Not Found. -# Version 0.29.0 is the latest version that builds correctly. It also explicitly depends on lindera-core 0.29 -# and lindera-dictionary 0.29. -lindera-core = "0.29" -lindera-dictionary = "0.29" -lindera-tokenizer = { version = "0.29", features = [ - "cc-cedict", - "ipadic", - "ko-dic", -] } lru = "0.16" matches = "0.1" md5 = "0.8" @@ -175,7 +170,7 @@ pprof = { version = "0.15", features = ["flamegraph"] } predicates = "3" prettyplease = "0.2" proc-macro2 = "1.0" -prometheus = { version = "0.14", features = ["process"] } +prometheus = { version = "0.14", default-features = false, features = ["process"] } proptest = "1" prost = { version = "0.14", default-features = false, features = [ "derive", @@ -245,7 +240,10 @@ tokio = { version = "1.48", features = ["full"] } tokio-metrics = { version = "0.4", features = ["rt"] } tokio-rustls = { version = "0.26", default-features = false } tokio-stream = { version = "0.1", features = ["sync"] } -tokio-util = { version = "0.7", features = ["full"] } +tokio-util = { version = "0.7", default-features = false, features = [ + "compat", + "io-util", +] } toml = "0.9" tonic = { version = "0.14", features = [ "_tls-any", @@ -295,9 +293,8 @@ vrl = { version = "0.29", default-features = false, features = [ "value", ] } warp = { version = "0.4", features = ["server", "test"] } -whichlang = "0.1" wiremock = "0.6" -zstd = "0.13" +zstd = { version = "0.13", default-features = false } aws-config = "1.8" aws-credential-types = { version = "1.2", features = ["hardcoded-credentials"] } @@ -356,7 +353,7 @@ quickwit-storage = { path = "quickwit-storage" } quickwit-telemetry = { path = "quickwit-telemetry" } -tantivy = { git = "https://github.com/SekoiaLab/tantivy/", rev = "e9aede4", default-features = false, features = [ +tantivy = { version = "0.26.1", default-features = false, features = [ "lz4-compression", "mmap", "quickwit", diff --git 
a/quickwit/quickwit-actors/src/actor.rs b/quickwit/quickwit-actors/src/actor.rs index 2fa32d7f2a5..bb5a48239a4 100644 --- a/quickwit/quickwit-actors/src/actor.rs +++ b/quickwit/quickwit-actors/src/actor.rs @@ -18,7 +18,6 @@ use std::sync::Arc; use async_trait::async_trait; use thiserror::Error; -use tracing::error; use crate::{ActorContext, QueueCapacity, SendError}; diff --git a/quickwit/quickwit-aws/src/error.rs b/quickwit/quickwit-aws/src/error.rs index 53983e5fcc6..97e44f55f1d 100644 --- a/quickwit/quickwit-aws/src/error.rs +++ b/quickwit/quickwit-aws/src/error.rs @@ -35,7 +35,7 @@ where E: AwsRetryable match self { SdkError::ConstructionFailure(_) => false, SdkError::TimeoutError(_) => true, - SdkError::DispatchFailure(_) => false, + SdkError::DispatchFailure(error) => error.is_io() || error.is_timeout(), SdkError::ResponseError(_) => true, SdkError::ServiceError(error) => error.err().is_retryable(), _ => false, diff --git a/quickwit/quickwit-cli/Cargo.toml b/quickwit/quickwit-cli/Cargo.toml index c595cb7e90a..ce598a417ee 100644 --- a/quickwit/quickwit-cli/Cargo.toml +++ b/quickwit/quickwit-cli/Cargo.toml @@ -58,8 +58,8 @@ tracing-subscriber = { workspace = true } quickwit-actors = { workspace = true } quickwit-cluster = { workspace = true } quickwit-common = { workspace = true } +quickwit-directories = { workspace = true } quickwit-config = { workspace = true } -quickwit-doc-mapper = { workspace = true } quickwit-index-management = { workspace = true } quickwit-indexing = { workspace = true } quickwit-ingest = { workspace = true } @@ -70,6 +70,7 @@ quickwit-search = { workspace = true } quickwit-serve = { workspace = true } quickwit-storage = { workspace = true } quickwit-telemetry = { workspace = true } +tantivy = { workspace = true } [dev-dependencies] predicates = { workspace = true } @@ -105,7 +106,6 @@ release-feature-set = [ "quickwit-storage/azure", "quickwit-storage/gcs", "quickwit-metastore/postgres", - "quickwit-doc-mapper/multilang", ] 
release-feature-vendored-set = [ "jemalloc", @@ -119,7 +119,6 @@ release-feature-vendored-set = [ "quickwit-storage/azure", "quickwit-storage/gcs", "quickwit-metastore/postgres", - "quickwit-doc-mapper/multilang", ] release-macos-feature-vendored-set = [ "jemalloc", @@ -132,13 +131,8 @@ release-macos-feature-vendored-set = [ "quickwit-storage/azure", "quickwit-storage/gcs", "quickwit-metastore/postgres", - "quickwit-doc-mapper/multilang", ] release-jemalloc-profiled = [ "release-feature-set", "jemalloc-profiled", ] - -[package.metadata.cargo-machete] -# used to enable the `multilang` feature -ignored = ["quickwit-doc-mapper"] diff --git a/quickwit/quickwit-cli/src/cli.rs b/quickwit/quickwit-cli/src/cli.rs index 91bb338ae89..9532ad230bc 100644 --- a/quickwit/quickwit-cli/src/cli.rs +++ b/quickwit/quickwit-cli/src/cli.rs @@ -18,6 +18,7 @@ use quickwit_serve::EnvFilterReloadFn; use tracing::Level; use crate::index::{IndexCliCommand, build_index_command}; +use crate::maintenance::{MaintenanceCliCommand, build_maintenance_command}; use crate::service::{RunCliCommand, build_run_command}; use crate::source::{SourceCliCommand, build_source_command}; use crate::split::{SplitCliCommand, build_split_command}; @@ -47,6 +48,7 @@ pub fn build_cli() -> Command { .subcommand(build_source_command().display_order(3)) .subcommand(build_split_command().display_order(4)) .subcommand(build_tool_command().display_order(5)) + .subcommand(build_maintenance_command().display_order(6)) .arg_required_else_help(true) .disable_help_subcommand(true) .subcommand_required(true) @@ -59,6 +61,7 @@ pub enum CliCommand { Split(SplitCliCommand), Source(SourceCliCommand), Tool(ToolCliCommand), + Maintenance(MaintenanceCliCommand), } impl CliCommand { @@ -69,6 +72,7 @@ impl CliCommand { CliCommand::Source(_) => Level::ERROR, CliCommand::Split(_) => Level::ERROR, CliCommand::Tool(_) => Level::ERROR, + CliCommand::Maintenance(_) => Level::ERROR, } } @@ -82,6 +86,9 @@ impl CliCommand { "source" => 
SourceCliCommand::parse_cli_args(submatches).map(CliCommand::Source), "split" => SplitCliCommand::parse_cli_args(submatches).map(CliCommand::Split), "tool" => ToolCliCommand::parse_cli_args(submatches).map(CliCommand::Tool), + "maintenance" => { + MaintenanceCliCommand::parse_cli_args(submatches).map(CliCommand::Maintenance) + } _ => bail!("unknown command `{subcommand}`"), } } @@ -93,6 +100,7 @@ impl CliCommand { CliCommand::Source(subcommand) => subcommand.execute().await, CliCommand::Split(subcommand) => subcommand.execute().await, CliCommand::Tool(subcommand) => subcommand.execute().await, + CliCommand::Maintenance(subcommand) => subcommand.execute().await, } } } diff --git a/quickwit/quickwit-cli/src/lib.rs b/quickwit/quickwit-cli/src/lib.rs index aaeb4da7e9d..e05d5dc25aa 100644 --- a/quickwit/quickwit-cli/src/lib.rs +++ b/quickwit/quickwit-cli/src/lib.rs @@ -50,6 +50,7 @@ pub mod index; #[cfg(feature = "jemalloc")] pub mod jemalloc; pub mod logger; +pub mod maintenance; pub mod metrics; pub mod service; pub mod source; diff --git a/quickwit/quickwit-cli/src/maintenance.rs b/quickwit/quickwit-cli/src/maintenance.rs new file mode 100644 index 00000000000..d639d19b4d0 --- /dev/null +++ b/quickwit/quickwit-cli/src/maintenance.rs @@ -0,0 +1,149 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use anyhow::{Context, bail}; +use clap::{ArgMatches, Command}; +use colored::Colorize; +use tracing::debug; + +use crate::checklist::{GREEN_COLOR, RED_COLOR}; +use crate::{ClientArgs, client_args}; + +pub fn build_maintenance_command() -> Command { + Command::new("maintenance") + .about("Manages cluster maintenance mode for safe rolling upgrades.") + .args(client_args()) + .subcommand(Command::new("enable").about( + "Enables maintenance mode. Freezes the indexing plan; metadata mutations are accepted \ + but the plan is not rebuilt.", + )) + .subcommand( + Command::new("disable") + .about("Disables maintenance mode and triggers a full indexing plan rebuild."), + ) + .subcommand(Command::new("status").about("Shows the current maintenance mode status.")) + .subcommand_required(true) + .arg_required_else_help(true) +} + +#[derive(Debug, PartialEq)] +pub struct EnableMaintenanceArgs { + pub client_args: ClientArgs, +} + +#[derive(Debug, PartialEq)] +pub struct DisableMaintenanceArgs { + pub client_args: ClientArgs, +} + +#[derive(Debug, PartialEq)] +pub struct MaintenanceStatusArgs { + pub client_args: ClientArgs, +} + +#[derive(Debug, PartialEq)] +pub enum MaintenanceCliCommand { + Enable(EnableMaintenanceArgs), + Disable(DisableMaintenanceArgs), + Status(MaintenanceStatusArgs), +} + +impl MaintenanceCliCommand { + pub fn parse_cli_args(mut matches: ArgMatches) -> anyhow::Result { + let (subcommand, submatches) = matches + .remove_subcommand() + .context("failed to parse maintenance subcommand")?; + match subcommand.as_str() { + "enable" => Self::parse_enable_args(submatches), + "disable" => Self::parse_disable_args(submatches), + "status" => Self::parse_status_args(submatches), + _ => bail!("unknown maintenance subcommand `{subcommand}`"), + } + } + + fn parse_enable_args(mut matches: ArgMatches) -> anyhow::Result { + let client_args = ClientArgs::parse(&mut matches)?; + Ok(Self::Enable(EnableMaintenanceArgs { client_args })) + } + + fn parse_disable_args(mut 
matches: ArgMatches) -> anyhow::Result { + let client_args = ClientArgs::parse(&mut matches)?; + Ok(Self::Disable(DisableMaintenanceArgs { client_args })) + } + + fn parse_status_args(mut matches: ArgMatches) -> anyhow::Result { + let client_args = ClientArgs::parse(&mut matches)?; + Ok(Self::Status(MaintenanceStatusArgs { client_args })) + } + + pub fn default_log_level(&self) -> tracing::Level { + tracing::Level::ERROR + } + + pub async fn execute(self) -> anyhow::Result<()> { + match self { + Self::Enable(args) => enable_maintenance_cli(args).await, + Self::Disable(args) => disable_maintenance_cli(args).await, + Self::Status(args) => maintenance_status_cli(args).await, + } + } +} + +async fn enable_maintenance_cli(args: EnableMaintenanceArgs) -> anyhow::Result<()> { + debug!(args=?args, "enable-maintenance"); + println!("❯ Enabling maintenance mode..."); + let qw_client = args.client_args.client(); + let response = qw_client.maintenance().enable().await?; + println!( + "{} Maintenance mode enabled. Indexing plan frozen.", + "✔".color(GREEN_COLOR) + ); + debug!(frozen_plan_json_len = response.frozen_plan_json.len()); + Ok(()) +} + +async fn disable_maintenance_cli(args: DisableMaintenanceArgs) -> anyhow::Result<()> { + debug!(args=?args, "disable-maintenance"); + println!("❯ Disabling maintenance mode..."); + let qw_client = args.client_args.client(); + qw_client.maintenance().disable().await?; + println!( + "{} Maintenance mode disabled. 
Indexing plan rebuild triggered.", + "✔".color(GREEN_COLOR) + ); + Ok(()) +} + +async fn maintenance_status_cli(args: MaintenanceStatusArgs) -> anyhow::Result<()> { + debug!(args=?args, "maintenance-status"); + let qw_client = args.client_args.client(); + let status = qw_client.maintenance().status().await?; + if status.is_maintenance_mode { + println!( + "{} Maintenance mode is {}", + "●".color(RED_COLOR), + "ENABLED".color(RED_COLOR).bold() + ); + if let Some(enabled_at) = status.enabled_at { + println!(" Enabled at: {enabled_at}"); + } + } else { + println!( + "{} Maintenance mode is {}", + "●".color(GREEN_COLOR), + "DISABLED".color(GREEN_COLOR).bold() + ); + } + Ok(()) +} diff --git a/quickwit/quickwit-cli/src/tool.rs b/quickwit/quickwit-cli/src/tool.rs index d32db8a9e45..70993405353 100644 --- a/quickwit/quickwit-cli/src/tool.rs +++ b/quickwit/quickwit-cli/src/tool.rs @@ -21,6 +21,7 @@ use std::time::{Duration, Instant}; use std::{env, fmt, io}; use anyhow::{Context, bail}; +use bytesize::ByteSize; use clap::{ArgMatches, Command, arg}; use colored::{ColoredString, Colorize}; use humantime::format_duration; @@ -36,9 +37,11 @@ use quickwit_config::{ CLI_SOURCE_ID, IndexerConfig, NodeConfig, SourceConfig, SourceInputFormat, SourceParams, TransformConfig, VecSourceParams, }; +use quickwit_directories::BundleDirectory; use quickwit_index_management::{IndexService, clear_cache_directory}; use quickwit_indexing::IndexingPipeline; use quickwit_indexing::actors::{IndexingService, MergePipeline, MergeSchedulerService}; +use quickwit_indexing::mature_merge::{MatureMergeConfig, merge_mature_all_indexes}; use quickwit_indexing::models::{ DetachIndexingPipeline, DetachMergePipeline, IndexingStatistics, SpawnPipeline, }; @@ -53,6 +56,9 @@ use quickwit_serve::{ BodyFormat, SearchRequestQueryString, SortBy, search_request_from_api_request, }; use quickwit_storage::{BundleStorage, Storage}; +use tantivy::Index; +use tantivy::directory::FileSlice; +use 
tantivy::schema::FieldType; use thousands::Separable; use tracing::{debug, info}; @@ -135,6 +141,16 @@ pub fn build_tool_command() -> Command { arg!(--"target-dir" "Directory to extract the split to."), ]) ) + .subcommand( + Command::new("analyze-split-file") + .about("Analyze a local split file.") + .long_about("Analyzes the contents of a local .split file. Does not require a node config or metastore access.") + .args(&[ + arg!(--"split-file" "Path to the local .split file to analyze.") + .display_order(1) + .required(true), + ]) + ) .subcommand( Command::new("gc") .display_order(10) @@ -163,6 +179,60 @@ pub fn build_tool_command() -> Command { .required(true), ]) ) + .subcommand( + Command::new("merge-mature") + .display_order(10) + .about("Merges mature splits across all indexes and nodes.") + .long_about( + "Scans indexes for merge opportunities in mature Published splits. Considers \ + opportunities across all origin nodes and sources. Runs once and exits." + ) + .args(&[ + arg!(--"dry-run" + "Prints the planned merge operations without executing them.") + .required(false), + arg!(--"max-concurrent-merges" + "Maximum number of merges to run concurrently (default: 10).") + .display_order(1) + .required(false), + arg!(--"retention-safety-buffer-days" + "Splits within this many days of the retention cutoff are excluded (default: 5).") + .display_order(2) + .required(false), + arg!(--"min-merge-group-size" + "Minimum number of splits in a group to trigger a merge (default: 5).") + .display_order(3) + .required(false), + arg!(--"input-split-max-num-docs" + "Maximum number of docs in a split for it to be eligible (default: 10_000).") + .display_order(4) + .required(false), + arg!(--"max-merge-group-size" + "Maximum number of splits per merge operation (default: 100).") + .display_order(5) + .required(false), + arg!(--"split-target-num-docs" + "Maximum total docs per merge operation (default: 5_000_000).") + .display_order(6) + .required(false), + 
arg!(--"split-timestamp-days-range" + "Group splits that span this many days together (0 = single-day, default: 0).") + .display_order(7) + .required(false), + arg!(--"index-parallelism" + "Number of indexes processed concurrently (default: 50).") + .display_order(8) + .required(false), + arg!(--"index-id-patterns" + "Comma-separated list of index ID patterns to include (default: '*').") + .display_order(9) + .required(false), + arg!(--"metrics" + "Expose Prometheus metrics on the REST listen address during the run.") + .display_order(10) + .required(false), + ]) + ) .arg_required_else_help(true) } @@ -207,6 +277,13 @@ pub struct MergeArgs { pub source_id: SourceId, } +#[derive(Debug, Eq, PartialEq)] +pub struct MatureMergeArgs { + pub config_uri: Uri, + pub merge_config: MatureMergeConfig, + pub serve_metrics: bool, +} + #[derive(Debug, Eq, PartialEq)] pub struct ExtractSplitArgs { pub config_uri: Uri, @@ -215,13 +292,20 @@ pub struct ExtractSplitArgs { pub target_dir: PathBuf, } +#[derive(Debug, Eq, PartialEq)] +pub struct AnalyzeSplitFileArgs { + pub split_file: PathBuf, +} + #[derive(Debug, Eq, PartialEq)] pub enum ToolCliCommand { GarbageCollect(GarbageCollectIndexArgs), LocalIngest(LocalIngestDocsArgs), LocalSearch(LocalSearchArgs), Merge(MergeArgs), + MatureMerge(MatureMergeArgs), ExtractSplit(ExtractSplitArgs), + AnalyzeSplitFile(AnalyzeSplitFileArgs), } impl ToolCliCommand { @@ -234,7 +318,9 @@ impl ToolCliCommand { "local-ingest" => Self::parse_local_ingest_args(submatches), "local-search" => Self::parse_local_search_args(submatches), "merge" => Self::parse_merge_args(submatches), + "merge-mature" => Self::parse_mature_merge_args(submatches), "extract-split" => Self::parse_extract_split_args(submatches), + "analyze-split-file" => Self::analyze_split_file_args(submatches), _ => bail!("unknown tool subcommand `{subcommand}`"), } } @@ -385,13 +471,100 @@ impl ToolCliCommand { })) } + fn parse_mature_merge_args(mut matches: ArgMatches) -> anyhow::Result { + 
let config_uri = matches + .remove_one::("config") + .map(|uri_str| Uri::from_str(&uri_str)) + .expect("`config` should be a required arg.")?; + let defaults = MatureMergeConfig::default(); + let dry_run = matches.get_flag("dry-run"); + let max_concurrent_merges = matches + .remove_one::("max-concurrent-merges") + .map(|s| s.parse::()) + .transpose()? + .unwrap_or(defaults.max_concurrent_merges); + let retention_safety_buffer_days = matches + .remove_one::("retention-safety-buffer-days") + .map(|s| s.parse::()) + .transpose()? + .unwrap_or(defaults.retention_safety_buffer_days); + let min_merge_group_size = matches + .remove_one::("min-merge-group-size") + .map(|s| s.parse::()) + .transpose()? + .unwrap_or(defaults.min_merge_group_size); + let input_split_max_num_docs = matches + .remove_one::("input-split-max-num-docs") + .map(|s| s.parse::()) + .transpose()? + .unwrap_or(defaults.input_split_max_num_docs); + let max_merge_group_size = matches + .remove_one::("max-merge-group-size") + .map(|s| s.parse::()) + .transpose()? + .unwrap_or(defaults.max_merge_group_size); + let split_target_num_docs = matches + .remove_one::("split-target-num-docs") + .map(|s| s.parse::()) + .transpose()? + .unwrap_or(defaults.split_target_num_docs); + let split_timestamp_days_range = matches + .remove_one::("split-timestamp-days-range") + .map(|s| s.parse::()) + .transpose()? + .unwrap_or(defaults.split_timestamp_days_range); + let index_parallelism = matches + .remove_one::("index-parallelism") + .map(|s| s.parse::()) + .transpose()? 
+ .unwrap_or(defaults.index_parallelism); + let index_id_patterns = matches + .remove_one::("index-id-patterns") + .map(|s| s.split(',').map(|p| p.trim().to_string()).collect()) + .unwrap_or(defaults.index_id_patterns); + let serve_metrics = matches.get_flag("metrics"); + + if max_concurrent_merges == 0 { + bail!("`max-concurrent-merges` must be greater than or equal to 1."); + } + if index_parallelism == 0 { + bail!("`index-parallelism` must be greater than or equal to 1."); + } + Ok(Self::MatureMerge(MatureMergeArgs { + config_uri, + serve_metrics, + merge_config: MatureMergeConfig { + dry_run, + max_concurrent_merges, + retention_safety_buffer_days, + min_merge_group_size, + input_split_max_num_docs, + max_merge_group_size, + split_target_num_docs, + split_timestamp_days_range, + index_parallelism, + index_id_patterns, + }, + })) + } + + fn analyze_split_file_args(mut matches: ArgMatches) -> anyhow::Result { + let split_file = matches + .remove_one::("split-file") + .map(PathBuf::from) + .expect("`split-file` should be a required arg."); + Ok(Self::AnalyzeSplitFile(AnalyzeSplitFileArgs { split_file })) + } + pub async fn execute(self) -> anyhow::Result<()> { match self { Self::GarbageCollect(args) => garbage_collect_index_cli(args).await, Self::LocalIngest(args) => local_ingest_docs_cli(args).await, Self::LocalSearch(args) => local_search_cli(args).await, Self::Merge(args) => merge_cli(args).await, + Self::MatureMerge(args) => merge_mature_cli(args).await, Self::ExtractSplit(args) => extract_split_cli(args).await, + Self::AnalyzeSplitFile(args) => analyze_split_file_cli(args).await, } } } @@ -555,7 +728,7 @@ pub async fn local_search_cli(args: LocalSearchArgs) -> anyhow::Result<()> { split_id: None, }; let search_request = - search_request_from_api_request(vec![args.index_id], search_request_query_string)?; + search_request_from_api_request(vec![args.index_id], search_request_query_string, None)?; debug!(search_request=?search_request, "search-request"); let 
search_response: SearchResponse = single_node_search(search_request, metastore, storage_resolver).await?; @@ -651,6 +824,43 @@ pub async fn merge_cli(args: MergeArgs) -> anyhow::Result<()> { Ok(()) } +pub async fn merge_mature_cli(args: MatureMergeArgs) -> anyhow::Result<()> { + debug!(args=?args, "merge-mature"); + info!(merge_config=?args.merge_config, "merge-mature configuration"); + println!("❯ Scanning all indexes for mature merge opportunities..."); + let config = load_node_config(&args.config_uri).await?; + let (storage_resolver, metastore_resolver) = + get_resolvers(&config.storage_configs, &config.metastore_configs); + let metastore = metastore_resolver.resolve(&config.metastore_uri).await?; + + let runtimes_config = RuntimesConfig::default(); + start_actor_runtimes( + runtimes_config, + &HashSet::from_iter([QuickwitService::Indexer]), + )?; + + if args.serve_metrics { + let metrics_addr = config.rest_config.listen_addr; + tokio::spawn(serve_metrics(metrics_addr)); + } + + merge_mature_all_indexes( + metastore, + storage_resolver, + &config.data_dir_path, + args.merge_config.clone(), + config.node_id, + ) + .await?; + + if !args.merge_config.dry_run { + info!("mature splits successfully merged, waiting for explicit termination signal"); + tokio::time::sleep(Duration::MAX).await; + } + + Ok(()) +} + pub async fn garbage_collect_index_cli(args: GarbageCollectIndexArgs) -> anyhow::Result<()> { debug!(args=?args, "garbage-collect-index"); println!("❯ Garbage collecting index..."); @@ -748,6 +958,149 @@ async fn extract_split_cli(args: ExtractSplitArgs) -> anyhow::Result<()> { Ok(()) } +fn print_per_field( + label: &str, + usage: &tantivy::space_usage::PerFieldSpaceUsage, + // Per-JSON-field sub-key breakdown: field_name -> sorted (sub_key, bytes) + json_sub_keys: &std::collections::HashMap>, +) { + let total = usage.total().get_bytes(); + if total == 0 { + return; + } + let mut fields: Vec<_> = usage.fields().collect(); + fields.sort_by_key(|f| 
std::cmp::Reverse(f.total()));
+    println!(" {label:<14} {}", ByteSize(total));
+    for field in &fields {
+        println!(
+            " {:<40} {}",
+            field.field_name(),
+            ByteSize(field.total().get_bytes())
+        );
+        if let Some(sub_keys) = json_sub_keys.get(field.field_name()) {
+            for (key, bytes) in sub_keys {
+                println!(" {:<38} {}", key, ByteSize(*bytes));
+            }
+        }
+    }
+}
+
+/// Prints a per-component space usage report for a local `.split` file.
+///
+/// The file is read entirely into memory and opened as a tantivy index through
+/// `BundleDirectory`. Only the first segment is reported. For each index component
+/// (term dict, postings, positions, fast fields, field norms, store) the size is
+/// printed per field. For JSON fields, postings and positions are additionally
+/// broken down per JSON path by scanning the field's term dictionary.
+///
+/// Returns an error if the path cannot be resolved, the file cannot be read, or
+/// the file cannot be opened as a tantivy index.
+async fn analyze_split_file_cli(args: AnalyzeSplitFileArgs) -> anyhow::Result<()> {
+    debug!(args=?args, "analyze-split-file");
+    println!("❯ Analyzing split file...");
+
+    let split_file_path = args.split_file.canonicalize().with_context(|| {
+        format!(
+            "failed to resolve split file path `{}`",
+            args.split_file.display()
+        )
+    })?;
+    let split_data_vec = std::fs::read(&split_file_path)
+        .with_context(|| format!("failed to read split file `{}`", split_file_path.display()))?;
+
+    // --- Tantivy space usage analysis ---
+    // Analysis is the only purpose of this command, so a split that cannot be opened is
+    // reported as an error instead of being silently ignored.
+    let file_slice = FileSlice::from(split_data_vec);
+    let index = BundleDirectory::open_split(file_slice)
+        .and_then(|dir| Index::open(dir).map_err(std::io::Error::other))
+        .with_context(|| {
+            format!(
+                "failed to open split file `{}` as a tantivy index",
+                split_file_path.display()
+            )
+        })?;
+    let reader = index.reader()?;
+    let searcher = reader.searcher();
+    let schema = index.schema();
+    let usage = searcher.space_usage()?;
+    let Some(seg) = usage.segments().first() else {
+        println!("split contains no segment");
+        return Ok(());
+    };
+    // Only fetch the segment reader once we know segment 0 exists: indexing an empty
+    // searcher would panic.
+    let seg_reader = searcher.segment_reader(0);
+    println!("\n{} docs:", seg.num_docs());
+
+    // Scan each JSON field's term dictionary and accumulate postings / positions
+    // bytes separately per JSON path.
+    // Result maps: field_name -> Vec<(path, bytes)> sorted by decreasing size.
+    let mut postings_sub_keys: std::collections::HashMap<String, Vec<(String, u64)>> =
+        std::collections::HashMap::new();
+    let mut positions_sub_keys: std::collections::HashMap<String, Vec<(String, u64)>> =
+        std::collections::HashMap::new();
+    for (field, field_entry) in schema.fields() {
+        if !matches!(field_entry.field_type(), FieldType::JsonObject(_)) {
+            continue;
+        }
+        let inv_index = seg_reader.inverted_index(field)?;
+        let mut stream = inv_index.terms().stream()?;
+        let mut postings_per_key: std::collections::BTreeMap<String, u64> =
+            std::collections::BTreeMap::new();
+        let mut positions_per_key: std::collections::BTreeMap<String, u64> =
+            std::collections::BTreeMap::new();
+        // Term key layout for JSON fields: [path bytes][0x00][value type][value bytes]
+        // Path segments are separated by 0x01. No type-code prefix in the SSTable key.
+        while let Some((key_bytes, term_info)) = stream.next() {
+            let path_end = key_bytes
+                .iter()
+                .position(|&b| b == 0x00)
+                .unwrap_or(key_bytes.len());
+            let path_bytes = &key_bytes[..path_end];
+            // Replace 0x01 segment separators with '.' to reconstruct the full path.
+            // Path segments that are not valid UTF-8 are rendered as empty strings.
+            let full_path: String = path_bytes
+                .split(|&b| b == 0x01)
+                .map(|path_segment| std::str::from_utf8(path_segment).unwrap_or(""))
+                .collect::<Vec<_>>()
+                .join(".");
+            *postings_per_key.entry(full_path.clone()).or_default() +=
+                term_info.postings_range.len() as u64;
+            *positions_per_key.entry(full_path).or_default() +=
+                term_info.positions_range.len() as u64;
+        }
+        let field_name = field_entry.name().to_string();
+        let mut postings_sorted: Vec<_> = postings_per_key.into_iter().collect();
+        postings_sorted.sort_by(|a, b| b.1.cmp(&a.1));
+        postings_sub_keys.insert(field_name.clone(), postings_sorted);
+        let mut positions_sorted: Vec<_> = positions_per_key.into_iter().collect();
+        // Paths without any positions data are not worth a line in the report.
+        positions_sorted.retain(|(_, b)| *b > 0);
+        positions_sorted.sort_by(|a, b| b.1.cmp(&a.1));
+        positions_sub_keys.insert(field_name, positions_sorted);
+    }
+
+    print_per_field(
+        "term dict",
+        seg.termdict(),
+        &std::collections::HashMap::new(),
+    );
+    print_per_field("postings", seg.postings(), &postings_sub_keys);
+    print_per_field("positions", seg.positions(), &positions_sub_keys);
+    print_per_field(
+        "fast fields",
+        seg.fast_fields(),
+        &std::collections::HashMap::new(),
+    );
+    print_per_field(
+        "field norms",
+        seg.fieldnorms(),
+        &std::collections::HashMap::new(),
+    );
+    let store = seg.store();
+    let store_total = store.total().get_bytes();
+    if store_total > 0 {
+        println!(" {:<14} {}", "store", ByteSize(store_total));
+        println!(
+            " {:<40} {}",
+            "data",
+            ByteSize(store.data_usage().get_bytes())
+        );
+        println!(
+            " {:<40} {}",
+            "offsets",
+            ByteSize(store.offsets_usage().get_bytes())
+        );
+    }
+    println!();
+    Ok(())
+}
+
 /// Starts a tokio task that displays the indexing statistics
 /// every once in awhile.
pub async fn start_statistics_reporting_loop(
@@ -955,3 +1308,62 @@ async fn create_empty_cluster(config: &NodeConfig) -> anyhow::Result {
 
     Ok(cluster)
 }
+
+/// Minimal HTTP endpoint exposing the Prometheus metrics without loading the whole
+/// quickwit_serve machinery.
+///
+/// Binds a TCP listener on `addr`, then accepts connections in an endless loop. A single
+/// read of at most 4096 bytes is performed per connection: requests starting with
+/// `GET /metrics` get the metrics text payload (or a 500 if encoding fails), anything
+/// else gets a 404. If binding fails, a warning is logged and the function returns.
+async fn serve_metrics(addr: std::net::SocketAddr) {
+    use tokio::io::{AsyncReadExt, AsyncWriteExt};
+    let listener = match tokio::net::TcpListener::bind(addr).await {
+        Ok(listener) => listener,
+        Err(err) => {
+            tracing::warn!("metrics server could not bind to {addr}: {err}");
+            return;
+        }
+    };
+    tracing::info!("metrics server listening on http://{addr}/metrics");
+    loop {
+        let mut stream = match listener.accept().await {
+            Ok((stream, _peer)) => stream,
+            Err(err) => {
+                // `accept` can keep failing (e.g. when file descriptors are exhausted):
+                // log and back off briefly instead of spinning in a tight loop.
+                tracing::warn!("metrics server failed to accept a connection: {err}");
+                tokio::time::sleep(std::time::Duration::from_millis(100)).await;
+                continue;
+            }
+        };
+        tokio::spawn(async move {
+            let mut buf = [0u8; 4096];
+            // Best effort: a failed read simply drops the connection.
+            let n = match stream.read(&mut buf).await {
+                Ok(n) => n,
+                Err(_) => return,
+            };
+            let request = std::str::from_utf8(&buf[..n]).unwrap_or("");
+            let is_metrics = request.starts_with("GET /metrics");
+            let (status, body) = if is_metrics {
+                match quickwit_common::metrics::metrics_text_payload() {
+                    Ok(payload) => ("200 OK", payload),
+                    Err(e) => {
+                        tracing::error!("failed to encode prometheus metrics: {e}");
+                        ("500 Internal Server Error", String::new())
+                    }
+                }
+            } else {
+                ("404 Not Found", String::new())
+            };
+            let response = format!(
+                "HTTP/1.1 {status}\r\nContent-Type: text/plain; version=0.0.4\r\nContent-Length: \
+                 {}\r\nConnection: close\r\n\r\n{body}",
+                body.len()
+            );
+            // Best effort: the client may already have disconnected.
+            let _ = stream.write_all(response.as_bytes()).await;
+        });
+    }
+}
diff --git a/quickwit/quickwit-cluster/src/grpc_gossip.rs b/quickwit/quickwit-cluster/src/grpc_gossip.rs
index 10be33970db..e974a975118 100644
--- a/quickwit/quickwit-cluster/src/grpc_gossip.rs
+++ b/quickwit/quickwit-cluster/src/grpc_gossip.rs
@@ -147,7 +147,7 @@ async fn perform_grpc_gossip_rounds(
                     },
                 )
             });
-            chitchat_guard.reset_node_state_if_update(
+            chitchat_guard.reset_node_state(
                 &chitchat_id,
                 key_values,
                 proto_node_state.max_version,
diff --git
a/quickwit/quickwit-common/Cargo.toml b/quickwit/quickwit-common/Cargo.toml index fe8c066c171..351b456a9ea 100644 --- a/quickwit/quickwit-common/Cargo.toml +++ b/quickwit/quickwit-common/Cargo.toml @@ -18,7 +18,6 @@ backtrace = { workspace = true, optional = true } bytesize = { workspace = true } coarsetime = { workspace = true } dyn-clone = { workspace = true } -env_logger = { workspace = true } fnv = { workspace = true } futures = { workspace = true } home = { workspace = true } @@ -50,9 +49,12 @@ tonic = { workspace = true, features = [ ] } tower = { workspace = true } tracing = { workspace = true } +tracing-subscriber = { workspace = true } [features] -testsuite = ["hyper-util"] +testsuite = [ + "hyper-util", +] named_tasks = ["tokio/tracing"] jemalloc-profiled = [ "named_tasks", diff --git a/quickwit/quickwit-common/src/lib.rs b/quickwit/quickwit-common/src/lib.rs index 0f3af2bc5ba..7feff4976ce 100644 --- a/quickwit/quickwit-common/src/lib.rs +++ b/quickwit/quickwit-common/src/lib.rs @@ -27,6 +27,7 @@ pub mod jemalloc_profiled; mod kill_switch; pub mod metrics; pub mod net; +pub mod numeric_types; mod path_hasher; pub mod pretty; mod progress; @@ -89,7 +90,10 @@ pub fn into_u64_range(range: Range) -> Range { } pub fn setup_logging_for_tests() { - let _ = env_logger::builder().format_timestamp(None).try_init(); + let _ = tracing_subscriber::fmt() + .with_test_writer() + .with_max_level(tracing::Level::INFO) + .try_init(); } pub fn split_file(split_id: impl Display) -> String { diff --git a/quickwit/quickwit-common/src/numeric_types.rs b/quickwit/quickwit-common/src/numeric_types.rs new file mode 100644 index 00000000000..cf4028f2888 --- /dev/null +++ b/quickwit/quickwit-common/src/numeric_types.rs @@ -0,0 +1,470 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! This module is copied over from Tantivy
+
+/// This module helps compare numerical values of different types (i64, u64
+/// and f64).
+pub mod num_cmp {
+    use std::cmp::Ordering;
+
+    /// Compares `left_i` with `right_f` without losing precision.
+    ///
+    /// Returns the ordering of `left_i` relative to `right_f`, or an error if
+    /// `right_f` is NaN.
+    pub fn cmp_i64_f64(left_i: i64, right_f: f64) -> Result<Ordering, String> {
+        if right_f.is_nan() {
+            return Err("NaN comparison is not supported".to_string());
+        }
+
+        // If right_f is < i64::MIN then left_i > right_f (i64::MIN=-2^63 can be
+        // exactly represented as f64)
+        if right_f < i64::MIN as f64 {
+            return Ok(Ordering::Greater);
+        }
+        // If right_f is >= i64::MAX then left_i < right_f (i64::MAX=2^63-1 cannot
+        // be exactly represented as f64)
+        if right_f >= i64::MAX as f64 {
+            return Ok(Ordering::Less);
+        }
+
+        // Now right_f is in [i64::MIN, i64::MAX), so `right_f as i64` is
+        // well-defined (truncation toward 0)
+        let right_as_i = right_f as i64;
+
+        let result = match left_i.cmp(&right_as_i) {
+            Ordering::Less => Ordering::Less,
+            Ordering::Greater => Ordering::Greater,
+            Ordering::Equal => {
+                // they have the same integer part, compare the fraction
+                let rem = right_f - (right_as_i as f64);
+                if rem == 0.0 {
+                    Ordering::Equal
+                } else if right_f > 0.0 {
+                    Ordering::Less
+                } else {
+                    Ordering::Greater
+                }
+            }
+        };
+        Ok(result)
+    }
+
+    /// Compares `left_u` with `right_f` without losing precision.
+    ///
+    /// Returns the ordering of `left_u` relative to `right_f`, or an error if
+    /// `right_f` is NaN.
+    pub fn cmp_u64_f64(left_u: u64, right_f: f64) -> Result<Ordering, String> {
+        if right_f.is_nan() {
+            return Err("NaN comparison is not supported".to_string());
+        }
+
+        // Negative floats are always less than any u64
+        if right_f < 0.0 {
+            return Ok(Ordering::Greater);
+        }
+
+        // If right_f is >= u64::MAX then left_u < right_f (u64::MAX=2^64-1 cannot be
+        // exactly represented as f64: `u64::MAX as f64` is 2^64). The comparison must be
+        // `>=`: with `>`, 2^64 would fall through to the saturating cast below and
+        // compare equal to u64::MAX.
+        let max_as_f = u64::MAX as f64;
+        if right_f >= max_as_f {
+            return Ok(Ordering::Less);
+        }
+
+        // Now right_f is in [0, u64::MAX), so `right_f as u64` is well-defined
+        // (truncation toward 0)
+        let right_as_u = right_f as u64;
+
+        let result = match left_u.cmp(&right_as_u) {
+            Ordering::Less => Ordering::Less,
+            Ordering::Greater => Ordering::Greater,
+            Ordering::Equal => {
+                // they have the same integer part, compare the fraction
+                let rem = right_f - (right_as_u as f64);
+                if rem == 0.0 {
+                    Ordering::Equal
+                } else {
+                    Ordering::Less
+                }
+            }
+        };
+        Ok(result)
+    }
+
+    /// Compares `left_i` with `right_u`. Negative `left_i` values are less than any u64.
+    pub fn cmp_i64_u64(left_i: i64, right_u: u64) -> Ordering {
+        if left_i < 0 {
+            Ordering::Less
+        } else {
+            let left_as_u = left_i as u64;
+            left_as_u.cmp(&right_u)
+        }
+    }
+}
+
+/// This module helps project numerical values to other numerical types.
+/// When the target value space cannot exactly represent the source value, the
+/// next representable value is returned (or AfterLast if the source value is
+/// larger than the largest representable value).
+///
+/// All functions in this module assume that f64 values are not NaN.
+pub mod num_proj {
+    /// Result of projecting a number onto another numerical type.
+    #[derive(Debug, PartialEq)]
+    pub enum ProjectedNumber<T> {
+        /// The source value is exactly representable in the target type.
+        Exact(T),
+        /// The source value is not representable; holds the next representable value.
+        Next(T),
+        /// The source value is larger than the largest representable value.
+        AfterLast,
+    }
+
+    /// Projects an i64 onto u64. Negative values project to `Next(0)`.
+    pub fn i64_to_u64(value: i64) -> ProjectedNumber<u64> {
+        if value < 0 {
+            ProjectedNumber::Next(0)
+        } else {
+            ProjectedNumber::Exact(value as u64)
+        }
+    }
+
+    /// Projects a u64 onto i64. Values above `i64::MAX` project to `AfterLast`.
+    pub fn u64_to_i64(value: u64) -> ProjectedNumber<i64> {
+        if value > i64::MAX as u64 {
+            ProjectedNumber::AfterLast
+        } else {
+            ProjectedNumber::Exact(value as i64)
+        }
+    }
+
+    /// Projects a (non-NaN) f64 onto u64.
+    pub fn f64_to_u64(value: f64) -> ProjectedNumber<u64> {
+        if value < 0.0 {
+            ProjectedNumber::Next(0)
+        } else if value >= u64::MAX as f64 {
+            // `u64::MAX as f64` is 2^64, which is already out of the u64 range, hence `>=`.
+            ProjectedNumber::AfterLast
+        } else if value.fract() == 0.0 {
+            ProjectedNumber::Exact(value as u64)
+        } else {
+            // casting f64 to u64 truncates toward zero
+            ProjectedNumber::Next(value as u64 + 1)
+        }
+    }
+
+    /// Projects a (non-NaN) f64 onto i64.
+    pub fn f64_to_i64(value: f64) -> ProjectedNumber<i64> {
+        if value < (i64::MIN as f64) {
+            ProjectedNumber::Next(i64::MIN)
+        } else if value >= (i64::MAX as f64) {
+            ProjectedNumber::AfterLast
+        } else if value.fract() == 0.0 {
+            ProjectedNumber::Exact(value as i64)
+        } else if value > 0.0 {
+            // casting f64 to i64 truncates toward zero
+            ProjectedNumber::Next(value as i64 + 1)
+        } else {
+            ProjectedNumber::Next(value as i64)
+        }
+    }
+
+    /// Projects an i64 onto f64.
+    pub fn i64_to_f64(value: i64) -> ProjectedNumber<f64> {
+        let value_f = value as f64;
+        // Round-trip through i128 rather than i64: `value_f as i64` saturates, so
+        // i64::MAX (which rounds up to 2^63 as f64) would wrongly look exact.
+        let k_roundtrip = value_f as i128;
+        let value_wide = i128::from(value);
+        if k_roundtrip == value_wide {
+            // between -2^53 and 2^53 all i64 are exactly represented as f64
+            ProjectedNumber::Exact(value_f)
+        } else if k_roundtrip > value_wide {
+            // for very large/small i64 values, it is approximated to the closest f64;
+            // here the closest f64 is already above the source value
+            ProjectedNumber::Next(value_f)
+        } else {
+            ProjectedNumber::Next(value_f.next_up())
+        }
+    }
+
+    /// Projects a u64 onto f64.
+    pub fn u64_to_f64(value: u64) -> ProjectedNumber<f64> {
+        let value_f = value as f64;
+        // Round-trip through u128 rather than u64: `value_f as u64` saturates, so
+        // u64::MAX (which rounds up to 2^64 as f64) would wrongly look exact.
+        let k_roundtrip = value_f as u128;
+        let value_wide = u128::from(value);
+        if k_roundtrip == value_wide {
+            // between 0 and 2^53 all u64 are exactly represented as f64
+            ProjectedNumber::Exact(value_f)
+        } else if k_roundtrip > value_wide {
+            ProjectedNumber::Next(value_f)
+        } else {
+            ProjectedNumber::Next(value_f.next_up())
+        }
+    }
+}
+
+#[cfg(test)]
+mod num_cmp_tests {
+    use std::cmp::Ordering;
+
+    use super::num_cmp::*;
+
+    #[test]
+    fn test_cmp_u64_f64() {
+        // Basic comparisons
+        assert_eq!(cmp_u64_f64(5, 5.0).unwrap(), Ordering::Equal);
+        assert_eq!(cmp_u64_f64(5, 6.0).unwrap(), Ordering::Less);
+        assert_eq!(cmp_u64_f64(6, 5.0).unwrap(), Ordering::Greater);
+        assert_eq!(cmp_u64_f64(0, 0.0).unwrap(), Ordering::Equal);
+        assert_eq!(cmp_u64_f64(0, 0.1).unwrap(), Ordering::Less);
+
+        // Negative float values should always be less than any u64
+        assert_eq!(cmp_u64_f64(0, -0.1).unwrap(), Ordering::Greater);
+        assert_eq!(cmp_u64_f64(5, -5.0).unwrap(), Ordering::Greater);
+        assert_eq!(cmp_u64_f64(u64::MAX, -1e20).unwrap(), Ordering::Greater);
+
+        // Tests with extreme values
+        assert_eq!(cmp_u64_f64(u64::MAX, 1e20).unwrap(), Ordering::Less);
+
+        // Precision edge cases: large u64 that loses precision when converted to f64
+        // => 2^54, exactly represented as f64
+        let large_f64 = 18_014_398_509_481_984.0;
+        let large_u64 = 18_014_398_509_481_984;
+        // prove that large_u64 is exactly represented as f64
+        assert_eq!(large_u64 as f64, large_f64);
+        assert_eq!(cmp_u64_f64(large_u64, large_f64).unwrap(), Ordering::Equal);
+        // => (2^54 + 1) cannot be exactly represented in f64
+        let large_u64_plus_1 = 18_014_398_509_481_985;
+        // prove that it is represented as f64 by large_f64
+        assert_eq!(large_u64_plus_1 as f64, large_f64);
+        assert_eq!(
+            cmp_u64_f64(large_u64_plus_1, large_f64).unwrap(),
+            Ordering::Greater
+        );
+        // => (2^54 - 1) cannot be exactly represented in f64
+        let large_u64_minus_1 = 18_014_398_509_481_983;
+        // prove that it is also represented as f64 by large_f64
+        assert_eq!(large_u64_minus_1 as f64, large_f64);
+        assert_eq!(
+            cmp_u64_f64(large_u64_minus_1, large_f64).unwrap(),
+            Ordering::Less
+        );
+
+        // NaN comparison results in an error
+        assert!(cmp_u64_f64(0, f64::NAN).is_err());
+    }
+ + #[test] + fn test_cmp_i64_f64() { + // Basic comparisons + assert_eq!(cmp_i64_f64(5, 5.0).unwrap(), Ordering::Equal); + assert_eq!(cmp_i64_f64(5, 6.0).unwrap(), Ordering::Less); + assert_eq!(cmp_i64_f64(6, 5.0).unwrap(), Ordering::Greater); + assert_eq!(cmp_i64_f64(-5, -5.0).unwrap(), Ordering::Equal); + assert_eq!(cmp_i64_f64(-5, -4.0).unwrap(), Ordering::Less); + assert_eq!(cmp_i64_f64(-4, -5.0).unwrap(), Ordering::Greater); + assert_eq!(cmp_i64_f64(-5, 5.0).unwrap(), Ordering::Less); + assert_eq!(cmp_i64_f64(5, -5.0).unwrap(), Ordering::Greater); + assert_eq!(cmp_i64_f64(0, -0.1).unwrap(), Ordering::Greater); + assert_eq!(cmp_i64_f64(0, 0.1).unwrap(), Ordering::Less); + assert_eq!(cmp_i64_f64(-1, -0.5).unwrap(), Ordering::Less); + assert_eq!(cmp_i64_f64(-1, 0.0).unwrap(), Ordering::Less); + assert_eq!(cmp_i64_f64(0, 0.0).unwrap(), Ordering::Equal); + + // Tests with extreme values + assert_eq!(cmp_i64_f64(i64::MAX, 1e20).unwrap(), Ordering::Less); + assert_eq!(cmp_i64_f64(i64::MIN, -1e20).unwrap(), Ordering::Greater); + + // Precision edge cases: large i64 that loses precision when converted to f64 + // => 2^54, exactly represented as f64 + let large_f64 = 18_014_398_509_481_984.0; + let large_i64 = 18_014_398_509_481_984; + // prove that large_i64 is exactly represented as f64 + assert_eq!(large_i64 as f64, large_f64); + assert_eq!(cmp_i64_f64(large_i64, large_f64).unwrap(), Ordering::Equal); + // => (1_i64 << 54) + 1 cannot be exactly represented in f64 + let large_i64_plus_1 = 18_014_398_509_481_985; + // prove that it is represented as f64 by large_f64 + assert_eq!(large_i64_plus_1 as f64, large_f64); + assert_eq!( + cmp_i64_f64(large_i64_plus_1, large_f64).unwrap(), + Ordering::Greater + ); + // => (1_i64 << 54) - 1 cannot be exactly represented in f64 + let large_i64_minus_1 = 18_014_398_509_481_983; + // prove that it is also represented as f64 by large_f64 + assert_eq!(large_i64_minus_1 as f64, large_f64); + assert_eq!( + 
cmp_i64_f64(large_i64_minus_1, large_f64).unwrap(), + Ordering::Less + ); + + // Same precision edge case but with negative values + // => -2^54, exactly represented as f64 + let large_neg_f64 = -18_014_398_509_481_984.0; + let large_neg_i64 = -18_014_398_509_481_984; + // prove that large_neg_i64 is exactly represented as f64 + assert_eq!(large_neg_i64 as f64, large_neg_f64); + assert_eq!( + cmp_i64_f64(large_neg_i64, large_neg_f64).unwrap(), + Ordering::Equal + ); + // => (-2^54 + 1) cannot be exactly represented in f64 + let large_neg_i64_plus_1 = -18_014_398_509_481_985; + // prove that it is represented as f64 by large_neg_f64 + assert_eq!(large_neg_i64_plus_1 as f64, large_neg_f64); + assert_eq!( + cmp_i64_f64(large_neg_i64_plus_1, large_neg_f64).unwrap(), + Ordering::Less + ); + // => (-2^54 - 1) cannot be exactly represented in f64 + let large_neg_i64_minus_1 = -18_014_398_509_481_983; + // prove that it is also represented as f64 by large_neg_f64 + assert_eq!(large_neg_i64_minus_1 as f64, large_neg_f64); + assert_eq!( + cmp_i64_f64(large_neg_i64_minus_1, large_neg_f64).unwrap(), + Ordering::Greater + ); + + // NaN comparison results in an error + assert!(cmp_i64_f64(0, f64::NAN).is_err()); + } + + #[test] + fn test_cmp_i64_u64() { + // Test with negative i64 values (should always be less than any u64) + assert_eq!(cmp_i64_u64(-1, 0), Ordering::Less); + assert_eq!(cmp_i64_u64(i64::MIN, 0), Ordering::Less); + assert_eq!(cmp_i64_u64(i64::MIN, u64::MAX), Ordering::Less); + + // Test with positive i64 values + assert_eq!(cmp_i64_u64(0, 0), Ordering::Equal); + assert_eq!(cmp_i64_u64(1, 0), Ordering::Greater); + assert_eq!(cmp_i64_u64(1, 1), Ordering::Equal); + assert_eq!(cmp_i64_u64(0, 1), Ordering::Less); + assert_eq!(cmp_i64_u64(5, 10), Ordering::Less); + assert_eq!(cmp_i64_u64(10, 5), Ordering::Greater); + + // Test with values near i64::MAX and u64 conversion + assert_eq!(cmp_i64_u64(i64::MAX, i64::MAX as u64), Ordering::Equal); + 
assert_eq!(cmp_i64_u64(i64::MAX, (i64::MAX as u64) + 1), Ordering::Less); + assert_eq!(cmp_i64_u64(i64::MAX, u64::MAX), Ordering::Less); + } +} + +#[cfg(test)] +mod num_proj_tests { + use super::num_proj::{self, ProjectedNumber}; + + #[test] + fn test_i64_to_u64() { + assert_eq!(num_proj::i64_to_u64(-1), ProjectedNumber::Next(0)); + assert_eq!(num_proj::i64_to_u64(i64::MIN), ProjectedNumber::Next(0)); + assert_eq!(num_proj::i64_to_u64(0), ProjectedNumber::Exact(0)); + assert_eq!(num_proj::i64_to_u64(42), ProjectedNumber::Exact(42)); + assert_eq!( + num_proj::i64_to_u64(i64::MAX), + ProjectedNumber::Exact(i64::MAX as u64) + ); + } + + #[test] + fn test_u64_to_i64() { + assert_eq!(num_proj::u64_to_i64(0), ProjectedNumber::Exact(0)); + assert_eq!(num_proj::u64_to_i64(42), ProjectedNumber::Exact(42)); + assert_eq!( + num_proj::u64_to_i64(i64::MAX as u64), + ProjectedNumber::Exact(i64::MAX) + ); + assert_eq!( + num_proj::u64_to_i64((i64::MAX as u64) + 1), + ProjectedNumber::AfterLast + ); + assert_eq!(num_proj::u64_to_i64(u64::MAX), ProjectedNumber::AfterLast); + } + + #[test] + fn test_f64_to_u64() { + assert_eq!(num_proj::f64_to_u64(-1e25), ProjectedNumber::Next(0)); + assert_eq!(num_proj::f64_to_u64(-0.1), ProjectedNumber::Next(0)); + assert_eq!(num_proj::f64_to_u64(1e20), ProjectedNumber::AfterLast); + assert_eq!( + num_proj::f64_to_u64(f64::INFINITY), + ProjectedNumber::AfterLast + ); + assert_eq!(num_proj::f64_to_u64(0.0), ProjectedNumber::Exact(0)); + assert_eq!(num_proj::f64_to_u64(42.0), ProjectedNumber::Exact(42)); + assert_eq!(num_proj::f64_to_u64(0.5), ProjectedNumber::Next(1)); + assert_eq!(num_proj::f64_to_u64(42.1), ProjectedNumber::Next(43)); + } + + #[test] + fn test_f64_to_i64() { + assert_eq!(num_proj::f64_to_i64(-1e20), ProjectedNumber::Next(i64::MIN)); + assert_eq!( + num_proj::f64_to_i64(f64::NEG_INFINITY), + ProjectedNumber::Next(i64::MIN) + ); + assert_eq!(num_proj::f64_to_i64(1e20), ProjectedNumber::AfterLast); + assert_eq!( + 
num_proj::f64_to_i64(f64::INFINITY), + ProjectedNumber::AfterLast + ); + assert_eq!(num_proj::f64_to_i64(0.0), ProjectedNumber::Exact(0)); + assert_eq!(num_proj::f64_to_i64(42.0), ProjectedNumber::Exact(42)); + assert_eq!(num_proj::f64_to_i64(-42.0), ProjectedNumber::Exact(-42)); + assert_eq!(num_proj::f64_to_i64(0.5), ProjectedNumber::Next(1)); + assert_eq!(num_proj::f64_to_i64(42.1), ProjectedNumber::Next(43)); + assert_eq!(num_proj::f64_to_i64(-0.5), ProjectedNumber::Next(0)); + assert_eq!(num_proj::f64_to_i64(-42.1), ProjectedNumber::Next(-42)); + } + + #[test] + fn test_i64_to_f64() { + assert_eq!(num_proj::i64_to_f64(0), ProjectedNumber::Exact(0.0)); + assert_eq!(num_proj::i64_to_f64(42), ProjectedNumber::Exact(42.0)); + assert_eq!(num_proj::i64_to_f64(-42), ProjectedNumber::Exact(-42.0)); + + let max_exact = 9_007_199_254_740_992; // 2^53 + assert_eq!( + num_proj::i64_to_f64(max_exact), + ProjectedNumber::Exact(max_exact as f64) + ); + + // Test values that cannot be exactly represented as f64 (integers above 2^53) + let large_i64 = 9_007_199_254_740_993; // 2^53 + 1 + let closest_f64 = 9_007_199_254_740_992.0; + assert_eq!(large_i64 as f64, closest_f64); + if let ProjectedNumber::Next(val) = num_proj::i64_to_f64(large_i64) { + // Verify that the returned float is different from the direct cast + assert!(val > closest_f64); + assert!(val - closest_f64 < 2. 
* f64::EPSILON * closest_f64); + } else { + panic!("Expected ProjectedNumber::Next for large_i64"); + } + + // Test with very large negative value + let large_neg_i64 = -9_007_199_254_740_993; // -(2^53 + 1) + let closest_neg_f64 = -9_007_199_254_740_992.0; + assert_eq!(large_neg_i64 as f64, closest_neg_f64); + if let ProjectedNumber::Next(val) = num_proj::i64_to_f64(large_neg_i64) { + // Verify that the returned float is the closest representable f64 + assert_eq!(val, closest_neg_f64); + } else { + panic!("Expected ProjectedNumber::Next for large_neg_i64"); + } + } + + #[test] + fn test_u64_to_f64() { + assert_eq!(num_proj::u64_to_f64(0), ProjectedNumber::Exact(0.0)); + assert_eq!(num_proj::u64_to_f64(42), ProjectedNumber::Exact(42.0)); + + // Test the largest u64 value that can be exactly represented as f64 (2^53) + let max_exact = 9_007_199_254_740_992; // 2^53 + assert_eq!( + num_proj::u64_to_f64(max_exact), + ProjectedNumber::Exact(max_exact as f64) + ); + + // Test values that cannot be exactly represented as f64 (integers above 2^53) + let large_u64 = 9_007_199_254_740_993; // 2^53 + 1 + let closest_f64 = 9_007_199_254_740_992.0; + assert_eq!(large_u64 as f64, closest_f64); + if let ProjectedNumber::Next(val) = num_proj::u64_to_f64(large_u64) { + // Verify that the returned float is different from the direct cast + assert!(val > closest_f64); + assert!(val - closest_f64 < 2. 
* f64::EPSILON * closest_f64); + } else { + panic!("Expected ProjectedNumber::Next for large_u64"); + } + } +} diff --git a/quickwit/quickwit-common/src/rate_limited_tracing.rs b/quickwit/quickwit-common/src/rate_limited_tracing.rs index c9a323f9ec2..198c2bf8bdd 100644 --- a/quickwit/quickwit-common/src/rate_limited_tracing.rs +++ b/quickwit/quickwit-common/src/rate_limited_tracing.rs @@ -179,12 +179,13 @@ fn _check_macro_works() { #[doc(hidden)] pub use coarsetime::Instant as CoarsetimeInstant; +pub use rate_limited_debug; +pub use rate_limited_error; +pub use rate_limited_info; +pub use rate_limited_trace; #[doc(hidden)] pub use rate_limited_tracing; -pub use { - rate_limited_debug, rate_limited_error, rate_limited_info, rate_limited_trace, - rate_limited_warn, -}; +pub use rate_limited_warn; #[cfg(test)] mod tests { diff --git a/quickwit/quickwit-config/src/index_config/mod.rs b/quickwit/quickwit-config/src/index_config/mod.rs index e6e7adb3766..1f8af60aa57 100644 --- a/quickwit/quickwit-config/src/index_config/mod.rs +++ b/quickwit/quickwit-config/src/index_config/mod.rs @@ -487,6 +487,7 @@ impl crate::TestableForRegression for IndexConfig { ], timestamp_field: Some("timestamp".to_string()), secondary_timestamp_field: None, + indexation_time_field: None, tag_fields: BTreeSet::from_iter(["tenant_id".to_string(), "log_level".to_string()]), partition_key: Some("tenant_id".to_string()), max_num_partitions: NonZeroU32::new(100).unwrap(), diff --git a/quickwit/quickwit-config/src/node_config/mod.rs b/quickwit/quickwit-config/src/node_config/mod.rs index 31e19bce09c..0d70d0a2b77 100644 --- a/quickwit/quickwit-config/src/node_config/mod.rs +++ b/quickwit/quickwit-config/src/node_config/mod.rs @@ -59,6 +59,9 @@ pub struct RestConfig { pub struct GrpcConfig { #[serde(default = "GrpcConfig::default_max_message_size")] pub max_message_size: ByteSize, + /// Search server responses can be larger when returning many hits. 
+ #[serde(default = "GrpcConfig::default_max_search_message_size")] + pub max_search_message_size: ByteSize, #[serde(default)] pub tls: Option, // If set, keeps idle connection alive by periodically perform a @@ -104,6 +107,10 @@ impl GrpcConfig { ByteSize::mib(20) } + fn default_max_search_message_size() -> ByteSize { + ByteSize::mib(60) + } + pub fn validate(&self) -> anyhow::Result<()> { ensure!( self.max_message_size >= ByteSize::mb(1), @@ -118,6 +125,7 @@ impl Default for GrpcConfig { fn default() -> Self { Self { max_message_size: Self::default_max_message_size(), + max_search_message_size: Self::default_max_search_message_size(), tls: None, keep_alive: None, } @@ -846,6 +854,7 @@ mod tests { fn test_grpc_config_validate() { let grpc_config = GrpcConfig { max_message_size: ByteSize::mb(1), + max_search_message_size: ByteSize::mb(1), tls: None, keep_alive: None, }; @@ -853,6 +862,7 @@ mod tests { let grpc_config = GrpcConfig { max_message_size: ByteSize::kb(1), + max_search_message_size: ByteSize::kb(1), tls: None, keep_alive: None, }; diff --git a/quickwit/quickwit-config/src/storage_config.rs b/quickwit/quickwit-config/src/storage_config.rs index 52daffdb537..7a9af4b1cdf 100644 --- a/quickwit/quickwit-config/src/storage_config.rs +++ b/quickwit/quickwit-config/src/storage_config.rs @@ -425,6 +425,7 @@ impl fmt::Debug for S3StorageConfig { "disable_multi_object_delete", &self.disable_multi_object_delete, ) + .field("encryption", &self.encryption) .finish() } } diff --git a/quickwit/quickwit-control-plane/Cargo.toml b/quickwit/quickwit-control-plane/Cargo.toml index 2957c9858c4..63cd6138af7 100644 --- a/quickwit/quickwit-control-plane/Cargo.toml +++ b/quickwit/quickwit-control-plane/Cargo.toml @@ -13,6 +13,7 @@ license.workspace = true [dependencies] anyhow = { workspace = true } async-trait = { workspace = true } +base64 = { workspace = true } bytesize = { workspace = true } fnv = { workspace = true } futures = { workspace = true } @@ -20,10 +21,12 @@ 
itertools = { workspace = true } lru = { workspace = true } mockall = { workspace = true, optional = true } once_cell = { workspace = true } +prost = { workspace = true } rand = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } smallvec = { workspace = true } +time = { workspace = true } tokio = { workspace = true } tracing = { workspace = true } ulid = { workspace = true } diff --git a/quickwit/quickwit-control-plane/src/control_plane.rs b/quickwit/quickwit-control-plane/src/control_plane.rs index e4c6995d639..ca801f5a034 100644 --- a/quickwit/quickwit-control-plane/src/control_plane.rs +++ b/quickwit/quickwit-control-plane/src/control_plane.rs @@ -40,7 +40,10 @@ use quickwit_ingest::{IngesterPool, LocalShardsUpdate}; use quickwit_metastore::{CreateIndexRequestExt, CreateIndexResponseExt, IndexMetadataResponseExt}; use quickwit_proto::control_plane::{ AdviseResetShardsRequest, AdviseResetShardsResponse, ControlPlaneError, ControlPlaneResult, + DisableMaintenanceModeRequest, DisableMaintenanceModeResponse, EnableMaintenanceModeRequest, + EnableMaintenanceModeResponse, GetMaintenanceModeRequest, GetMaintenanceModeResponse, GetOrCreateOpenShardsRequest, GetOrCreateOpenShardsResponse, GetOrCreateOpenShardsSubrequest, + SwapIndexingPipelinesRequest, SwapIndexingPipelinesResponse, }; use quickwit_proto::indexing::ShardPositionsUpdate; use quickwit_proto::metastore::{ @@ -62,6 +65,7 @@ use crate::debouncer::Debouncer; use crate::indexing_scheduler::{IndexingScheduler, IndexingSchedulerState}; use crate::ingest::IngestController; use crate::ingest::ingest_controller::{IngestControllerStats, RebalanceShardsCallback}; +use crate::maintenance::{MaintenanceState, MetastoreKvPersistence, serialize_frozen_plan}; use crate::model::ControlPlaneModel; /// Interval between two controls (or checks) of the desired plan VS running plan. @@ -102,6 +106,11 @@ pub struct ControlPlane { readiness_tx: watch::Sender, // Disables the control loop. 
This is useful for unit testing. disable_control_loop: bool, + /// Maintenance mode state. When active the indexing plan is frozen (not + /// rebuilt on topology changes). + maintenance: MaintenanceState, + /// Persistence backend for maintenance mode state (frozen plan + metadata). + maintenance_persistence: MetastoreKvPersistence, } impl fmt::Debug for ControlPlane { @@ -125,6 +134,7 @@ impl ControlPlane { watch::Receiver, ) { let disable_control_loop = false; + let maintenance_persistence = MetastoreKvPersistence::new(metastore.clone()); Self::spawn_inner( universe, cluster_config, @@ -134,6 +144,7 @@ impl ControlPlane { ingester_pool, metastore, disable_control_loop, + maintenance_persistence, ) } @@ -147,6 +158,7 @@ impl ControlPlane { ingester_pool: IngesterPool, metastore: MetastoreServiceClient, disable_control_loop: bool, + maintenance_persistence: MetastoreKvPersistence, ) -> ( Mailbox, ActorHandle>, @@ -186,6 +198,8 @@ impl ControlPlane { rebuild_plan_debouncer: Debouncer::new(REBUILD_PLAN_COOLDOWN_PERIOD), readiness_tx, disable_control_loop, + maintenance: MaintenanceState::default(), + maintenance_persistence: maintenance_persistence.clone(), } }); (control_plane_mailbox, control_plane_handle, readiness_rx) @@ -199,6 +213,7 @@ pub struct ControlPlaneObservableState { pub num_indexes: usize, pub num_sources: usize, pub readiness: bool, + pub maintenance_mode: bool, } #[async_trait] @@ -216,6 +231,7 @@ impl Actor for ControlPlane { num_indexes: self.model.num_indexes(), num_sources: self.model.num_sources(), readiness: *self.readiness_tx.borrow(), + maintenance_mode: self.maintenance.is_active(), } } @@ -227,7 +243,17 @@ impl Actor for ControlPlane { .await .context("failed to initialize control plane model")?; - let _rebuild_plan_waiter = self.rebuild_plan_debounced(ctx); + self.load_maintenance_state_from_persistence().await; + + if self.maintenance.is_active() { + // In maintenance mode: restore the frozen plan without triggering a rebuild. 
+ info!( + enabled_at = self.maintenance.enabled_at().unwrap_or_default(), + "control plane starting in maintenance mode: indexing plan is frozen" + ); + } else { + let _rebuild_plan_waiter = self.rebuild_plan_debounced(ctx); + } self.ingest_controller.sync_with_all_ingesters(&self.model); @@ -245,6 +271,37 @@ impl Actor for ControlPlane { } impl ControlPlane { + /// Loads maintenance state from the persistence backend. + /// Called during `initialize()`. + async fn load_maintenance_state_from_persistence(&mut self) { + match self.maintenance_persistence.load().await { + Some(persisted) => { + self.maintenance.load_from_metadata(persisted.metadata); + if self.maintenance.is_active() { + crate::metrics::CONTROL_PLANE_METRICS + .maintenance_mode + .set(1); + let num_indexers = persisted.frozen_plan.num_indexers(); + let num_pipelines: usize = persisted + .frozen_plan + .indexing_tasks_per_indexer() + .values() + .map(|tasks| tasks.len()) + .sum(); + info!( + num_indexers, + num_pipelines, "restored frozen indexing plan from persistence" + ); + self.indexing_scheduler + .load_frozen_plan(persisted.frozen_plan); + } + } + None => { + // No maintenance state persisted — normal operation. 
+ } + } + } + async fn auto_create_indexes( &mut self, subrequests: &[GetOrCreateOpenShardsSubrequest], @@ -353,7 +410,7 @@ impl ControlPlane { let physical_indexing_plan: Vec = self .indexing_scheduler .observable_state() - .last_applied_physical_plan + .current_targeted_physical_plan .map(|plan| { plan.indexing_tasks_per_indexer() .iter() @@ -427,7 +484,8 @@ impl Handler for ControlPlane { _message: RebuildPlan, _ctx: &ActorContext, ) -> Result<(), ActorExitStatus> { - self.indexing_scheduler.rebuild_plan(&self.model); + self.indexing_scheduler + .rebuild_plan(&self.model, self.maintenance.is_active()); Ok(()) } } @@ -508,14 +566,21 @@ impl Handler for ControlPlane { if self.disable_control_loop { return Ok(()); } + let is_maintenance = self.maintenance.is_active(); if let Err(metastore_error) = self .ingest_controller - .rebalance_shards(&mut self.model, ctx.mailbox(), ctx.progress()) + .rebalance_shards( + &mut self.model, + ctx.mailbox(), + ctx.progress(), + is_maintenance, + ) .await { return convert_metastore_error::<()>(metastore_error).map(|_| ()); } - self.indexing_scheduler.control_running_plan(&self.model); + self.indexing_scheduler + .control_running_plan(&self.model, is_maintenance); ctx.schedule_self_msg(CONTROL_PLAN_LOOP_INTERVAL, ControlPlanLoop); Ok(()) } @@ -596,7 +661,8 @@ impl DeferableReplyHandler for ControlPlane { // Now, create index can also add sources to support creating indexes automatically from // index and source config templates. - let should_rebuild_plan = !index_metadata.sources.is_empty(); + let should_rebuild_plan = + !index_metadata.sources.is_empty() && !self.maintenance.is_active(); self.model.add_index(index_metadata); if should_rebuild_plan { @@ -646,6 +712,7 @@ impl Handler for ControlPlane { if self .model .update_index_config(&index_uid, index_metadata.index_config)? 
+ && !self.maintenance.is_active() { let _rebuild_plan_notifier = self.rebuild_plan_debounced(ctx); } @@ -688,7 +755,9 @@ impl Handler for ControlPlane { // TODO: Refine the event. Notify index will have the effect to reload the entire state from // the metastore. We should update the state of the control plane. - let _rebuild_plan_waiter = self.rebuild_plan_debounced(ctx); + if !self.maintenance.is_active() { + let _rebuild_plan_waiter = self.rebuild_plan_debounced(ctx); + } info!(%index_uid, "deleted index"); let response = EmptyResponse {}; @@ -730,7 +799,9 @@ impl Handler for ControlPlane { // TODO: Refine the event. Notify index will have the effect to reload the entire state from // the metastore. We should update the state of the control plane. - let _rebuild_plan_waiter = self.rebuild_plan_debounced(ctx); + if !self.maintenance.is_active() { + let _rebuild_plan_waiter = self.rebuild_plan_debounced(ctx); + } let response = EmptyResponse {}; Ok(Ok(response)) @@ -770,7 +841,9 @@ impl Handler for ControlPlane { // TODO: Refine the event. Notify index will have the effect to reload the entire state from // the metastore. We should update the state of the control plane. 
- let _rebuild_plan_waiter = self.rebuild_plan_debounced(ctx); + if !self.maintenance.is_active() { + let _rebuild_plan_waiter = self.rebuild_plan_debounced(ctx); + } info!(%index_uid, source_id, "updated source"); let response = EmptyResponse {}; @@ -806,7 +879,7 @@ impl Handler for ControlPlane { .toggle_source(&index_uid, &source_id, enable) .context("failed to toggle source")?; - if mutation_occurred { + if mutation_occurred && !self.maintenance.is_active() { let _rebuild_plan_waiter = self.rebuild_plan_debounced(ctx); } info!(%index_uid, source_id, enabled=enable, "toggled source"); @@ -861,7 +934,9 @@ impl Handler for ControlPlane { .sync_with_ingesters(&ingesters_needing_resync, &self.model); self.model.delete_source(&source_uid); - let _rebuild_plan_waiter = self.rebuild_plan_debounced(ctx); + if !self.maintenance.is_active() { + let _rebuild_plan_waiter = self.rebuild_plan_debounced(ctx); + } info!( index_uid=%source_uid.index_uid, @@ -916,9 +991,12 @@ impl Handler for ControlPlane { request: GetOrCreateOpenShardsRequest, ctx: &ActorContext, ) -> Result { - if let Err(metastore_error) = self - .auto_create_indexes(&request.subrequests, ctx.progress()) - .await + // In maintenance mode, block auto-create indexes but still allow shard routing + // for existing sources (ingest must continue). 
+ if !self.maintenance.is_active() + && let Err(metastore_error) = self + .auto_create_indexes(&request.subrequests, ctx.progress()) + .await { return convert_metastore_error(metastore_error); } @@ -953,6 +1031,20 @@ impl Handler for ControlPlane { } } +#[async_trait] +impl Handler for ControlPlane { + type Reply = ControlPlaneResult; + + async fn handle( + &mut self, + request: SwapIndexingPipelinesRequest, + _ctx: &ActorContext, + ) -> Result { + let response = self.indexing_scheduler.swap_pipelines(request); + Ok(response) + } +} + #[async_trait] impl Handler for ControlPlane { type Reply = ControlPlaneResult<()>; @@ -962,6 +1054,11 @@ impl Handler for ControlPlane { local_shards_update: LocalShardsUpdate, ctx: &ActorContext, ) -> Result { + if self.maintenance.is_active() { + // In maintenance mode: skip shard scaling to avoid changing the plan. + debug!("maintenance mode: ignoring local shards update (scaling frozen)"); + return Ok(Ok(())); + } if let Err(metastore_error) = self .ingest_controller .handle_local_shards_update(local_shards_update, &mut self.model, ctx.progress()) @@ -1053,19 +1150,34 @@ impl Handler for ControlPlane { message: IndexerJoined, ctx: &ActorContext, ) -> Result { - info!( - "indexer `{}` joined the cluster: rebalancing shards and rebuilding indexing plan", - message.0.node_id() - ); + let is_maintenance = self.maintenance.is_active(); + if is_maintenance { + info!( + "indexer `{}` joined the cluster during maintenance mode", + message.0.node_id() + ); + } else { + info!( + "indexer `{}` joined the cluster: rebalancing shards and rebuilding indexing plan", + message.0.node_id() + ); + } + // TODO: Update shard table. 
if let Err(metastore_error) = self .ingest_controller - .rebalance_shards(&mut self.model, ctx.mailbox(), ctx.progress()) + .rebalance_shards( + &mut self.model, + ctx.mailbox(), + ctx.progress(), + is_maintenance, + ) .await { return convert_metastore_error::<()>(metastore_error).map(|_| ()); } - self.indexing_scheduler.rebuild_plan(&self.model); + self.indexing_scheduler + .rebuild_plan(&self.model, is_maintenance); Ok(()) } } @@ -1083,125 +1195,817 @@ impl Handler for ControlPlane { message: IndexerLeft, ctx: &ActorContext, ) -> Result { - info!( - "indexer `{}` left the cluster: rebalancing shards and rebuilding indexing plan", - message.0.node_id() - ); + let is_maintenance = self.maintenance.is_active(); + if is_maintenance { + info!( + "indexer `{}` left the cluster during maintenance mode", + message.0.node_id() + ); + return Ok(()); + } else { + info!( + "indexer `{}` left the cluster: rebalancing shards and rebuilding indexing plan", + message.0.node_id() + ); + } // TODO: Update shard table. 
if let Err(metastore_error) = self .ingest_controller - .rebalance_shards(&mut self.model, ctx.mailbox(), ctx.progress()) + .rebalance_shards( + &mut self.model, + ctx.mailbox(), + ctx.progress(), + is_maintenance, + ) + .await + { + return convert_metastore_error::<()>(metastore_error).map(|_| ()); + } + self.indexing_scheduler + .rebuild_plan(&self.model, is_maintenance); + Ok(()) + } +} + +#[async_trait] +impl Handler for ControlPlane { + type Reply = (); + + async fn handle( + &mut self, + message: RebalanceShardsCallback, + _ctx: &ActorContext, + ) -> Result { + let num_closed_shards = message.closed_shards.len(); + debug!("closing {num_closed_shards} shards after rebalance"); + + for closed_shard in message.closed_shards { + let shard_id = closed_shard.shard_id().clone(); + let source_uid = SourceUid { + index_uid: closed_shard.index_uid().clone(), + source_id: closed_shard.source_id, + }; + self.model.close_shards(&source_uid, &[shard_id]); + } + // We drop the rebalance guard explicitly here to put some emphasis on where the rebalance + // lock is released.
+ drop(message.rebalance_guard); + Ok(()) + } +} + +// -- Maintenance Mode Handlers -- + +#[async_trait] +impl Handler for ControlPlane { + type Reply = ControlPlaneResult; + + async fn handle( + &mut self, + request: EnableMaintenanceModeRequest, + _ctx: &ActorContext, + ) -> Result { + self.handle_enable_maintenance(request).await + } +} + +#[async_trait] +impl Handler for ControlPlane { + type Reply = ControlPlaneResult; + + async fn handle( + &mut self, + _request: DisableMaintenanceModeRequest, + _ctx: &ActorContext, + ) -> Result { + self.handle_disable_maintenance().await + } +} + +#[async_trait] +impl Handler for ControlPlane { + type Reply = ControlPlaneResult; + + async fn handle( + &mut self, + _request: GetMaintenanceModeRequest, + _ctx: &ActorContext, + ) -> Result { + self.handle_get_maintenance() + } +} + +impl ControlPlane { + async fn handle_enable_maintenance( + &mut self, + _request: EnableMaintenanceModeRequest, + ) -> Result, ActorExitStatus> { + if self.maintenance.is_active() { + return Ok(Err(ControlPlaneError::Internal( + "maintenance mode is already enabled".to_string(), + ))); + } + + // Freeze the current plan. + let frozen_plan = self + .indexing_scheduler + .observable_state() + .current_targeted_physical_plan + .unwrap_or_else(|| crate::indexing_plan::PhysicalIndexingPlan::with_indexer_ids(&[])); + + let frozen_plan_json = match serialize_frozen_plan(&frozen_plan) { + Ok(json) => json, + Err(err) => { + return Ok(Err(ControlPlaneError::Internal(format!( + "failed to serialize frozen plan: {err}" + )))); + } + }; + + // Build the metadata (with RFC 3339 datetime). + let metadata = crate::maintenance::MaintenanceModeMetadata::new_now(); + + // Persist to durable storage BEFORE enabling in-memory state. + // This ensures that on restart, the control plane will find the persisted state + // even if it crashes right after this point. 
+ if let Err(err) = self + .maintenance_persistence + .save(&metadata, &frozen_plan) + .await + { + return Ok(Err(ControlPlaneError::Internal(format!( + "failed to persist maintenance state: {err}" + )))); + } + + // Only now enable in-memory state (persistence succeeded). + self.maintenance.load_from_metadata(metadata); + crate::metrics::CONTROL_PLANE_METRICS + .maintenance_mode + .set(1); + + info!( + num_indexers = frozen_plan.num_indexers(), + "maintenance mode enabled: indexing plan frozen" + ); + + Ok(Ok(EnableMaintenanceModeResponse { frozen_plan_json })) + } + + async fn handle_disable_maintenance( + &mut self, + ) -> Result, ActorExitStatus> { + if !self.maintenance.is_active() { + return Ok(Err(ControlPlaneError::Internal( + "maintenance mode is not currently enabled".to_string(), + ))); + } + + // Clear persisted state BEFORE disabling in-memory. + // This ensures that on restart, the control plane will NOT reload maintenance mode + // even if it crashes right after this point. + if let Err(err) = self.maintenance_persistence.clear().await { + return Ok(Err(ControlPlaneError::Internal(format!( + "failed to clear persisted maintenance state: {err}" + )))); + } + + // Only now disable in-memory state (persistence clear succeeded). + self.maintenance.disable(); + crate::metrics::CONTROL_PLANE_METRICS + .maintenance_mode + .set(0); + + // Trigger a full plan rebuild to reconcile the cluster. 
+ info!("maintenance mode disabled: triggering full indexing plan rebuild"); + self.indexing_scheduler.rebuild_plan(&self.model, false); + + Ok(Ok(DisableMaintenanceModeResponse {})) + } + + fn handle_get_maintenance( + &self, + ) -> Result, ActorExitStatus> { + let is_maintenance_mode = self.maintenance.is_active(); + let enabled_at = self.maintenance.enabled_at(); + + Ok(Ok(GetMaintenanceModeResponse { + is_maintenance_mode, + enabled_at, + })) + } +} + +fn spawn_watch_indexers_task( + weak_mailbox: WeakMailbox, + cluster_change_stream: ClusterChangeStream, +) { + tokio::spawn(watcher_indexers(weak_mailbox, cluster_change_stream)); +} + +async fn watcher_indexers( + weak_mailbox: WeakMailbox, + mut cluster_change_stream: ClusterChangeStream, +) { + while let Some(cluster_change) = cluster_change_stream.next().await { + let Some(mailbox) = weak_mailbox.upgrade() else { + return; + }; + match cluster_change { + ClusterChange::Add(node) => { + if node.enabled_services().contains(&QuickwitService::Indexer) + && let Err(error) = mailbox.send_message(IndexerJoined(node)).await + { + error!(%error, "failed to forward `IndexerJoined` event to control plane"); + } + } + ClusterChange::Remove(node) => { + if node.enabled_services().contains(&QuickwitService::Indexer) + && let Err(error) = mailbox.send_message(IndexerLeft(node)).await + { + error!(%error, "failed to forward `IndexerLeft` event to control plane"); + } + } + ClusterChange::Update(_) => { + // We are not interested in updates (yet). 
+ } + } + } +} + +#[cfg(test)] +mod tests { + use std::num::NonZero; + use std::sync::Arc; + + use futures::FutureExt; + use mockall::Sequence; + use quickwit_actors::{AskError, Observe, SupervisorMetrics}; + use quickwit_cluster::ClusterChangeStreamFactoryForTest; + use quickwit_common::test_utils::wait_until_predicate; + use quickwit_config::{ + CLI_SOURCE_ID, INGEST_V2_SOURCE_ID, IndexConfig, KafkaSourceParams, SourceParams, + }; + use quickwit_indexing::IndexingService; + use quickwit_metastore::{ + CreateIndexRequestExt, IndexMetadata, ListIndexesMetadataResponseExt, + }; + use quickwit_proto::control_plane::{ + GetOrCreateOpenShardsFailureReason, GetOrCreateOpenShardsSubrequest, + SwapIndexingPipelinesEntry, + }; + use quickwit_proto::indexing::{ + ApplyIndexingPlanRequest, ApplyIndexingPlanResponse, CpuCapacity, IndexingServiceClient, + MockIndexingService, + }; + use quickwit_proto::ingest::ingester::{ + IngesterServiceClient, InitShardSuccess, InitShardsResponse, MockIngesterService, + RetainShardsResponse, + }; + use quickwit_proto::ingest::{Shard, ShardPKey, ShardState}; + use quickwit_proto::metastore::{ + DeleteShardsResponse, EmptyResponse, EntityKind, FindIndexTemplateMatchesResponse, + GetKvResponse, ListIndexesMetadataRequest, ListIndexesMetadataResponse, ListShardsRequest, + ListShardsResponse, ListShardsSubresponse, MetastoreError, MockMetastoreService, + OpenShardSubresponse, OpenShardsResponse, SourceType, + }; + use quickwit_proto::types::{DocMappingUid, Position}; + use tokio::sync::Mutex; + + use super::*; + use crate::IndexerNodeInfo; + use crate::indexing_plan::PhysicalIndexingPlan; + use crate::maintenance::MetastoreKvPersistence; + + fn setup_disabled_maintenance(mock_metastore: &mut MockMetastoreService) { + mock_metastore + .expect_get_kv() + .returning(|_| Ok(GetKvResponse { value: None })); + } + + fn setup_maintenance_enable(mock_metastore: &mut MockMetastoreService) { + mock_metastore + .expect_get_kv() + .return_once(|_| 
Ok(GetKvResponse { value: None })); + mock_metastore + .expect_set_kv() + .return_once(|_| Ok(EmptyResponse {})); + } + + async fn observe_current_plan( + control_plane_handle: &ActorHandle>, + ) -> Option { + control_plane_handle + .observe() + .await + .state_opt + .as_ref()? + .indexing_scheduler + .current_targeted_physical_plan + .clone() + } + + #[must_use] + fn add_test_indexer_with_mailbox( + universe: &Universe, + indexer_pool: &IndexerPool, + node_id: NodeId, + ) -> quickwit_actors::Inbox { + let (client_mailbox, client_inbox) = universe.create_test_mailbox(); + let client = IndexingServiceClient::from_mailbox::(client_mailbox); + let indexer_info = IndexerNodeInfo { + node_id: node_id.clone(), + generation_id: 0, + client, + indexing_tasks: Vec::new(), + indexing_capacity: CpuCapacity::from_cpu_millis(4_000), + }; + indexer_pool.insert(node_id, indexer_info); + client_inbox + } + + #[tokio::test] + async fn test_maintenance_mode_allows_create_index_without_rebuild() { + let universe = Universe::with_accelerated_time(); + + let indexer_pool = IndexerPool::default(); + + // Add one indexer to the pool + let node_1: NodeId = "test-node-1".into(); + let _indexing_inbox_1 = + add_test_indexer_with_mailbox(&universe, &indexer_pool, node_1.clone()); + + let ingester_pool = IngesterPool::default(); + + let index_uid: IndexUid = IndexUid::for_test("test-index", 0); + let index_uid_clone = index_uid.clone(); + let mut mock_metastore = MockMetastoreService::new(); + setup_maintenance_enable(&mut mock_metastore); + mock_metastore + .expect_list_indexes_metadata() + .returning(|_| Ok(ListIndexesMetadataResponse::for_test(Vec::new()))); + mock_metastore + .expect_create_index() + .return_once(move |req| { + // re-serialize the received requested config + let index_config = req.deserialize_index_config().unwrap(); + let source_configs = req.deserialize_source_configs().unwrap(); + let mut index_metadata = IndexMetadata::new(index_config); + index_metadata.index_uid = 
index_uid_clone.clone(); + for source_config in source_configs { + index_metadata.add_source(source_config).unwrap(); + } + let index_metadata_json = serde_json::to_string(&index_metadata).unwrap(); + Ok(CreateIndexResponse { + index_uid: Some(index_uid_clone), + index_metadata_json, + }) + }); + + let cluster_config = ClusterConfig::for_test(); + let cluster_change_stream_factory = ClusterChangeStreamFactoryForTest::default(); + let (control_plane_mailbox, control_plane_handle, _readiness_rx) = ControlPlane::spawn( + &universe, + cluster_config, + node_1.clone(), + cluster_change_stream_factory, + indexer_pool.clone(), + ingester_pool, + MetastoreServiceClient::from_mock(mock_metastore), + ); + + // Wait for a first (empty) plan to be calculated. + wait_until_predicate( + || observe_current_plan(&control_plane_handle).map(|plan| plan.is_some()), + Duration::from_secs(5), + Duration::from_millis(100), + ) + .await + .unwrap(); + + // Enable maintenance mode. + control_plane_mailbox + .ask(EnableMaintenanceModeRequest {}) + .await + .unwrap() + .unwrap(); + + let original_physical_plan = observe_current_plan(&control_plane_handle).await; + + // Create index in maintenance mode + let index_config = IndexConfig::for_test("test-index", "ram:///test-index"); + let kafka_source = SourceConfig::for_test( + "kafka-source", + SourceParams::Kafka(KafkaSourceParams { + topic: "test-topic".to_string(), + client_log_level: None, + enable_backfill_mode: false, + client_params: json!({}), + }), + ); + let create_index_request = + CreateIndexRequest::try_from_index_and_source_configs(&index_config, &[kafka_source]) + .unwrap(); + let create_result = control_plane_mailbox + .ask_for_res(create_index_request) + .await; + assert!(create_result.is_ok()); + assert_eq!(create_result.unwrap().index_uid(), &index_uid); + // Check that plan rebuild is skipped + universe.sleep(Duration::from_secs(60)).await; + assert_eq!( + original_physical_plan, + 
observe_current_plan(&control_plane_handle).await, + "physical plan should not change after creating index in maintenance mode" + ); + + // Add another node + let node_2: NodeId = "test-node-2".into(); + let _indexing_inbox_2 = + add_test_indexer_with_mailbox(&universe, &indexer_pool, node_2.clone()); + // Check that the rebuild is still skipped + universe.sleep(Duration::from_secs(60)).await; + assert_eq!( + original_physical_plan, + observe_current_plan(&control_plane_handle).await, + "physical plan should not change after adding new node in maintenance mode" + ); + + universe.assert_quit().await; + } + + #[tokio::test] + async fn test_maintenance_mode_allows_delete_index() { + let universe = Universe::with_accelerated_time(); + let self_node_id: NodeId = "test-node".into(); + let indexer_pool = IndexerPool::default(); + let ingester_pool = IngesterPool::default(); + + let mut mock_metastore = MockMetastoreService::new(); + setup_maintenance_enable(&mut mock_metastore); + mock_metastore + .expect_list_indexes_metadata() + .returning(|_| Ok(ListIndexesMetadataResponse::for_test(Vec::new()))); + mock_metastore + .expect_delete_index() + .return_once(|_| Ok(EmptyResponse {})); + + let cluster_config = ClusterConfig::for_test(); + let cluster_change_stream_factory = ClusterChangeStreamFactoryForTest::default(); + let (control_plane_mailbox, _control_plane_handle, _readiness_rx) = ControlPlane::spawn( + &universe, + cluster_config, + self_node_id, + cluster_change_stream_factory, + indexer_pool, + ingester_pool, + MetastoreServiceClient::from_mock(mock_metastore), + ); + + // Enable maintenance mode. + control_plane_mailbox + .ask(EnableMaintenanceModeRequest {}) + .await + .unwrap() + .unwrap(); + + // Delete index in maintenance mode — should succeed, but plan rebuild is skipped. 
+ let index_uid = IndexUid::for_test("test-index", 0); + let delete_index_request = DeleteIndexRequest { + index_uid: Some(index_uid), + }; + let delete_result = control_plane_mailbox + .ask(delete_index_request) + .await + .unwrap(); + assert!(delete_result.is_ok()); + + universe.assert_quit().await; + } + + #[tokio::test] + async fn test_maintenance_mode_allows_add_source() { + let universe = Universe::with_accelerated_time(); + let self_node_id: NodeId = "test-node".into(); + let indexer_pool = IndexerPool::default(); + let ingester_pool = IngesterPool::default(); + + // Pre-load an index with an enabled ingest_v2 source so that + // `create_or_enable_ingest_v2_sources_if_necessary` does not call `add_source` on + // startup and consume the mock expectation meant for the test's own call. + let mut index_metadata = IndexMetadata::for_test("test-index", "ram:///test-index"); + let mut ingest_v2_source = SourceConfig::ingest_v2(); + ingest_v2_source.enabled = true; + index_metadata.add_source(ingest_v2_source).unwrap(); + let mut mock_metastore = MockMetastoreService::new(); + setup_maintenance_enable(&mut mock_metastore); + mock_metastore + .expect_list_indexes_metadata() + .return_once(move |_| Ok(ListIndexesMetadataResponse::for_test(vec![index_metadata]))); + mock_metastore + .expect_list_shards() + .return_once(|_| Ok(ListShardsResponse::default())); + mock_metastore + .expect_add_source() + .return_once(|_| Ok(EmptyResponse {})); + + let cluster_config = ClusterConfig::for_test(); + let cluster_change_stream_factory = ClusterChangeStreamFactoryForTest::default(); + let (control_plane_mailbox, _control_plane_handle, _readiness_rx) = ControlPlane::spawn( + &universe, + cluster_config, + self_node_id, + cluster_change_stream_factory, + indexer_pool, + ingester_pool, + MetastoreServiceClient::from_mock(mock_metastore), + ); + + // Enable maintenance mode. 
+ control_plane_mailbox + .ask(EnableMaintenanceModeRequest {}) + .await + .unwrap() + .unwrap(); + + // Add source in maintenance mode — should succeed, but plan rebuild is skipped. + let index_uid = IndexUid::for_test("test-index", 0); + let source_config = SourceConfig::for_test("test-source", SourceParams::void()); + let add_source_request = AddSourceRequest { + index_uid: Some(index_uid), + source_config_json: serde_json::to_string(&source_config).unwrap(), + }; + let result = control_plane_mailbox.ask(add_source_request).await.unwrap(); + assert!(result.is_ok()); + + universe.assert_quit().await; + } + + #[tokio::test] + async fn test_maintenance_mode_enable_disable_cycle() { + let universe = Universe::with_accelerated_time(); + let self_node_id: NodeId = "test-node".into(); + let indexer_pool = IndexerPool::default(); + let ingester_pool = IngesterPool::default(); + + let mut mock_metastore = MockMetastoreService::new(); + mock_metastore + .expect_get_kv() + .returning(|_| Ok(GetKvResponse { value: None })); + mock_metastore + .expect_set_kv() + .returning(|_| Ok(EmptyResponse {})); + mock_metastore + .expect_delete_kv() + .returning(|_| Ok(EmptyResponse {})); + mock_metastore + .expect_list_indexes_metadata() + .returning(|_| Ok(ListIndexesMetadataResponse::for_test(Vec::new()))); + + let cluster_config = ClusterConfig::for_test(); + let cluster_change_stream_factory = ClusterChangeStreamFactoryForTest::default(); + let (control_plane_mailbox, _control_plane_handle, _readiness_rx) = ControlPlane::spawn( + &universe, + cluster_config, + self_node_id, + cluster_change_stream_factory, + indexer_pool, + ingester_pool, + MetastoreServiceClient::from_mock(mock_metastore), + ); + + // Initially not in maintenance mode. + let status = control_plane_mailbox + .ask(GetMaintenanceModeRequest {}) + .await + .unwrap() + .unwrap(); + assert!(!status.is_maintenance_mode); + + // Enable. 
+ let enable_resp = control_plane_mailbox + .ask(EnableMaintenanceModeRequest {}) + .await + .unwrap() + .unwrap(); + assert!(!enable_resp.frozen_plan_json.is_empty()); + + // Check status. + let status = control_plane_mailbox + .ask(GetMaintenanceModeRequest {}) + .await + .unwrap() + .unwrap(); + assert!(status.is_maintenance_mode); + assert!(status.enabled_at.is_some()); + + // Enable again — should fail. + let double_enable = control_plane_mailbox + .ask(EnableMaintenanceModeRequest {}) + .await + .unwrap(); + assert!(double_enable.is_err()); + + // Disable. + let disable_resp = control_plane_mailbox + .ask(DisableMaintenanceModeRequest {}) + .await + .unwrap(); + assert!(disable_resp.is_ok()); + + // Check status again. + let status = control_plane_mailbox + .ask(GetMaintenanceModeRequest {}) + .await + .unwrap() + .unwrap(); + assert!(!status.is_maintenance_mode); + + // Disable again — should fail. + let double_disable = control_plane_mailbox + .ask(DisableMaintenanceModeRequest {}) + .await + .unwrap(); + assert!(double_disable.is_err()); + + universe.assert_quit().await; + } + + #[tokio::test] + async fn test_maintenance_mode_observable_state() { + let universe = Universe::with_accelerated_time(); + let self_node_id: NodeId = "test-node".into(); + let indexer_pool = IndexerPool::default(); + let ingester_pool = IngesterPool::default(); + + let mut mock_metastore = MockMetastoreService::new(); + setup_maintenance_enable(&mut mock_metastore); + mock_metastore + .expect_list_indexes_metadata() + .returning(|_| Ok(ListIndexesMetadataResponse::for_test(Vec::new()))); + + let cluster_config = ClusterConfig::for_test(); + let cluster_change_stream_factory = ClusterChangeStreamFactoryForTest::default(); + let (control_plane_mailbox, control_plane_handle, _readiness_rx) = ControlPlane::spawn( + &universe, + cluster_config, + self_node_id, + cluster_change_stream_factory, + indexer_pool, + ingester_pool, + MetastoreServiceClient::from_mock(mock_metastore), + ); + + 
// Observe initial state. + let obs = control_plane_handle.process_pending_and_observe().await; + let state = obs.state_opt.as_ref().unwrap(); + assert!(!state.maintenance_mode); + + // Enable maintenance mode. + control_plane_mailbox + .ask(EnableMaintenanceModeRequest {}) + .await + .unwrap() + .unwrap(); + + // Give the supervisor time to observe the inner actor's updated state. + universe.sleep(Duration::from_secs(1)).await; + + let obs = control_plane_handle.process_pending_and_observe().await; + let state = obs.state_opt.as_ref().unwrap(); + assert!(state.maintenance_mode); + + universe.assert_quit().await; + } + + #[tokio::test] + async fn test_maintenance_mode_allows_toggle_source() { + let universe = Universe::with_accelerated_time(); + let self_node_id: NodeId = "test-node".into(); + let indexer_pool = IndexerPool::default(); + let ingester_pool = IngesterPool::default(); + + // Pre-load an index with the test source and an enabled ingest_v2 source so that + // `create_or_enable_ingest_v2_sources_if_necessary` does not call `add_source` on + // startup and trigger unexpected mock calls. 
+ let mut index_metadata = IndexMetadata::for_test("test-index", "ram:///test-index"); + let test_source_config = SourceConfig::for_test("test-source", SourceParams::void()); + index_metadata.add_source(test_source_config).unwrap(); + let mut ingest_v2_source = SourceConfig::ingest_v2(); + ingest_v2_source.enabled = true; + index_metadata.add_source(ingest_v2_source).unwrap(); + + let mut mock_metastore = MockMetastoreService::new(); + setup_maintenance_enable(&mut mock_metastore); + mock_metastore + .expect_list_indexes_metadata() + .return_once(move |_| Ok(ListIndexesMetadataResponse::for_test(vec![index_metadata]))); + mock_metastore + .expect_list_shards() + .return_once(|_| Ok(ListShardsResponse::default())); + mock_metastore + .expect_toggle_source() + .return_once(|_| Ok(EmptyResponse {})); + + let cluster_config = ClusterConfig::for_test(); + let cluster_change_stream_factory = ClusterChangeStreamFactoryForTest::default(); + let (control_plane_mailbox, _control_plane_handle, _readiness_rx) = ControlPlane::spawn( + &universe, + cluster_config, + self_node_id, + cluster_change_stream_factory, + indexer_pool, + ingester_pool, + MetastoreServiceClient::from_mock(mock_metastore), + ); + + // Enable maintenance mode. + control_plane_mailbox + .ask(EnableMaintenanceModeRequest {}) .await - { - return convert_metastore_error::<()>(metastore_error).map(|_| ()); - } - self.indexing_scheduler.rebuild_plan(&self.model); - Ok(()) + .unwrap() + .unwrap(); + + // Toggle source in maintenance mode — should succeed, but plan rebuild is skipped. 
+ let index_uid = IndexUid::for_test("test-index", 0); + let toggle_request = ToggleSourceRequest { + index_uid: Some(index_uid), + source_id: "test-source".to_string(), + enable: false, + }; + let result = control_plane_mailbox.ask(toggle_request).await.unwrap(); + assert!(result.is_ok()); + + universe.assert_quit().await; } -} -#[async_trait] -impl Handler for ControlPlane { - type Reply = (); + #[tokio::test] + async fn test_maintenance_mode_allows_get_or_create_open_shards() { + // In maintenance mode, GetOrCreateOpenShards should still work for existing sources + // (ingest must continue), but auto_create_indexes is skipped. + let universe = Universe::with_accelerated_time(); + let self_node_id: NodeId = "test-node".into(); + let indexer_pool = IndexerPool::default(); + let ingester_pool = IngesterPool::default(); - async fn handle( - &mut self, - message: RebalanceShardsCallback, - _ctx: &ActorContext, - ) -> Result { - let num_closed_shards = message.closed_shards.len(); - debug!("closing {num_closed_shards} shards after rebalance"); + let mut mock_metastore = MockMetastoreService::new(); + setup_maintenance_enable(&mut mock_metastore); + mock_metastore + .expect_list_indexes_metadata() + .returning(|_| Ok(ListIndexesMetadataResponse::for_test(Vec::new()))); + // Note: no expect_find_index_template_matches — if auto_create was NOT skipped, + // this would panic due to unexpected call. - for closed_shard in message.closed_shards { - let shard_id = closed_shard.shard_id().clone(); - let source_uid = SourceUid { - index_uid: closed_shard.index_uid().clone(), - source_id: closed_shard.source_id, - }; - self.model.close_shards(&source_uid, &[shard_id]); - } - // We drop the rebalance guard explicitly here to put some emphasis on where a the rebalance - // lock is released. 
- drop(message.rebalance_guard); - Ok(()) - } -} + let cluster_config = ClusterConfig::for_test(); + let cluster_change_stream_factory = ClusterChangeStreamFactoryForTest::default(); + let (control_plane_mailbox, _control_plane_handle, _readiness_rx) = ControlPlane::spawn( + &universe, + cluster_config, + self_node_id, + cluster_change_stream_factory, + indexer_pool, + ingester_pool, + MetastoreServiceClient::from_mock(mock_metastore), + ); -fn spawn_watch_indexers_task( - weak_mailbox: WeakMailbox, - cluster_change_stream: ClusterChangeStream, -) { - tokio::spawn(watcher_indexers(weak_mailbox, cluster_change_stream)); -} + // Enable maintenance mode. + control_plane_mailbox + .ask(EnableMaintenanceModeRequest {}) + .await + .unwrap() + .unwrap(); -async fn watcher_indexers( - weak_mailbox: WeakMailbox, - mut cluster_change_stream: ClusterChangeStream, -) { - while let Some(cluster_change) = cluster_change_stream.next().await { - let Some(mailbox) = weak_mailbox.upgrade() else { - return; + // Send a GetOrCreateOpenShards with a nonexistent index. + // In maintenance, auto_create is skipped, so the index won't be found. + // The ingest controller will report a failure for unknown indexes, which is expected. 
+ let request = GetOrCreateOpenShardsRequest { + subrequests: vec![GetOrCreateOpenShardsSubrequest { + subrequest_id: 0, + index_id: "nonexistent-index".to_string(), + source_id: "source".to_string(), + }], + closed_shards: Vec::new(), + unavailable_leaders: Vec::new(), }; - match cluster_change { - ClusterChange::Add(node) => { - if node.enabled_services().contains(&QuickwitService::Indexer) - && let Err(error) = mailbox.send_message(IndexerJoined(node)).await - { - error!(%error, "failed to forward `IndexerJoined` event to control plane"); - } - } - ClusterChange::Remove(node) => { - if node.enabled_services().contains(&QuickwitService::Indexer) - && let Err(error) = mailbox.send_message(IndexerLeft(node)).await - { - error!(%error, "failed to forward `IndexerLeft` event to control plane"); - } + let result = control_plane_mailbox.ask(request).await.unwrap(); + // The request should succeed at the handler level. + // It may fail internally because the index doesn't exist, but that's expected. + match result { + Ok(response) => { + // The response should contain a failure for the unknown index. + assert!(!response.failures.is_empty()); + assert_eq!( + response.failures[0].reason(), + GetOrCreateOpenShardsFailureReason::IndexNotFound + ); } - ClusterChange::Update(_) => { - // We are not interested in updates (yet). + Err(_err) => { + // Any internal error is acceptable here (index not found, etc.). 
} } - } -} - -#[cfg(test)] -mod tests { - use std::num::NonZero; - use std::sync::Arc; - - use mockall::Sequence; - use quickwit_actors::{AskError, Observe, SupervisorMetrics}; - use quickwit_cluster::ClusterChangeStreamFactoryForTest; - use quickwit_config::{ - CLI_SOURCE_ID, INGEST_V2_SOURCE_ID, IndexConfig, KafkaSourceParams, SourceParams, - }; - use quickwit_indexing::IndexingService; - use quickwit_metastore::{ - CreateIndexRequestExt, IndexMetadata, ListIndexesMetadataResponseExt, - }; - use quickwit_proto::control_plane::{ - GetOrCreateOpenShardsFailureReason, GetOrCreateOpenShardsSubrequest, - }; - use quickwit_proto::indexing::{ - ApplyIndexingPlanRequest, ApplyIndexingPlanResponse, CpuCapacity, IndexingServiceClient, - MockIndexingService, - }; - use quickwit_proto::ingest::ingester::{ - IngesterServiceClient, InitShardSuccess, InitShardsResponse, MockIngesterService, - RetainShardsResponse, - }; - use quickwit_proto::ingest::{Shard, ShardPKey, ShardState}; - use quickwit_proto::metastore::{ - DeleteShardsResponse, EntityKind, FindIndexTemplateMatchesResponse, - ListIndexesMetadataRequest, ListIndexesMetadataResponse, ListShardsRequest, - ListShardsResponse, ListShardsSubresponse, MetastoreError, MockMetastoreService, - OpenShardSubresponse, OpenShardsResponse, SourceType, - }; - use quickwit_proto::types::{DocMappingUid, Position}; - use tokio::sync::Mutex; - use super::*; - use crate::IndexerNodeInfo; + universe.assert_quit().await; + } #[tokio::test] async fn test_control_plane_create_index() { @@ -1211,6 +2015,7 @@ mod tests { let ingester_pool = IngesterPool::default(); let mut mock_metastore = MockMetastoreService::new(); + setup_disabled_maintenance(&mut mock_metastore); let index_uid: IndexUid = IndexUid::for_test("test-index", 0); let index_uid_clone = index_uid.clone(); mock_metastore @@ -1268,6 +2073,7 @@ mod tests { let index_uid: IndexUid = IndexUid::for_test("test-index", 0); let mut mock_metastore = MockMetastoreService::new(); + 
setup_disabled_maintenance(&mut mock_metastore); let index_uid_clone = index_uid.clone(); mock_metastore .expect_delete_index() @@ -1314,6 +2120,7 @@ mod tests { .unwrap(); let mut mock_metastore = MockMetastoreService::new(); + setup_disabled_maintenance(&mut mock_metastore); mock_metastore .expect_add_source() .withf(|add_source_request| { @@ -1411,6 +2218,7 @@ mod tests { .unwrap(); let mut mock_metastore = MockMetastoreService::new(); + setup_disabled_maintenance(&mut mock_metastore); mock_metastore .expect_update_source() .withf(move |update_source_request| { @@ -1478,6 +2286,7 @@ mod tests { index_metadata.add_source(test_source_config).unwrap(); let mut mock_metastore = MockMetastoreService::new(); + setup_disabled_maintenance(&mut mock_metastore); mock_metastore .expect_list_indexes_metadata() .return_once(|_| Ok(ListIndexesMetadataResponse::for_test(vec![index_metadata]))); @@ -1548,6 +2357,7 @@ mod tests { let ingester_pool = IngesterPool::default(); let mut mock_metastore = MockMetastoreService::new(); + setup_disabled_maintenance(&mut mock_metastore); let index_uid: IndexUid = IndexUid::for_test("test-index", 0); let index_uid_clone = index_uid.clone(); mock_metastore @@ -1596,6 +2406,7 @@ mod tests { let ingester_pool = IngesterPool::default(); let mut mock_metastore = MockMetastoreService::new(); + setup_disabled_maintenance(&mut mock_metastore); let index_uid: IndexUid = IndexUid::for_test("test-index", 0); mock_metastore .expect_list_indexes_metadata() @@ -1674,6 +2485,7 @@ mod tests { let indexer_pool = IndexerPool::default(); let ingester_pool = IngesterPool::default(); let mut mock_metastore = MockMetastoreService::new(); + setup_disabled_maintenance(&mut mock_metastore); let mut index_0 = IndexMetadata::for_test("test-index-0", "ram:///test-index-0"); let source = SourceConfig::ingest_v2(); @@ -1808,18 +2620,14 @@ mod tests { let universe = Universe::with_accelerated_time(); let node_id = NodeId::new("test-control-plane".to_string()); let 
indexer_pool = IndexerPool::default(); - let (client_mailbox, client_inbox) = universe.create_test_mailbox(); - let client = IndexingServiceClient::from_mailbox::(client_mailbox); - let indexer_node_info = IndexerNodeInfo { - node_id: NodeId::new("test-indexer".to_string()), - generation_id: 0, - client, - indexing_tasks: Vec::new(), - indexing_capacity: CpuCapacity::from_cpu_millis(4_000), - }; - indexer_pool.insert(indexer_node_info.node_id.clone(), indexer_node_info); + let client_inbox = add_test_indexer_with_mailbox( + &universe, + &indexer_pool, + NodeId::new("test-indexer".to_string()), + ); let ingester_pool = IngesterPool::default(); let mut mock_metastore = MockMetastoreService::new(); + setup_disabled_maintenance(&mut mock_metastore); let mut index_0 = IndexMetadata::for_test("test-index-0", "ram:///test-index-0"); let mut source = SourceConfig::ingest_v2(); @@ -1906,7 +2714,7 @@ mod tests { control_plane_mailbox.ask(Observe).await.unwrap(); let last_applied_physical_plan = control_plane_obs .indexing_scheduler - .last_applied_physical_plan + .current_targeted_physical_plan .unwrap(); let indexing_tasks = last_applied_physical_plan .indexing_tasks_per_indexer() @@ -1937,7 +2745,7 @@ mod tests { control_plane_mailbox.ask(Observe).await.unwrap(); let last_applied_physical_plan = control_plane_obs .indexing_scheduler - .last_applied_physical_plan + .current_targeted_physical_plan .unwrap(); let indexing_tasks = last_applied_physical_plan .indexing_tasks_per_indexer() @@ -1957,18 +2765,14 @@ mod tests { let universe = Universe::with_accelerated_time(); let node_id = NodeId::new("test-control-plane".to_string()); let indexer_pool = IndexerPool::default(); - let (client_mailbox, _client_inbox) = universe.create_test_mailbox(); - let client = IndexingServiceClient::from_mailbox::(client_mailbox); - let indexer_node_info = IndexerNodeInfo { - node_id: NodeId::new("test-indexer".to_string()), - generation_id: 0, - client, - indexing_tasks: Vec::new(), - 
indexing_capacity: CpuCapacity::from_cpu_millis(4_000), - }; - indexer_pool.insert(indexer_node_info.node_id.clone(), indexer_node_info); + let _indexing_inbox = add_test_indexer_with_mailbox( + &universe, + &indexer_pool, + NodeId::new("test-indexer".to_string()), + ); let ingester_pool = IngesterPool::default(); let mut mock_metastore = MockMetastoreService::new(); + setup_disabled_maintenance(&mut mock_metastore); let mut index_metadata = IndexMetadata::for_test("test-index", "ram:///test-index"); let mut source_config = SourceConfig::ingest_v2(); @@ -2035,18 +2839,14 @@ mod tests { let universe = Universe::default(); let node_id = NodeId::new("test-control-plane".to_string()); let indexer_pool = IndexerPool::default(); - let (client_mailbox, _client_inbox) = universe.create_test_mailbox(); - let client = IndexingServiceClient::from_mailbox::(client_mailbox); - let indexer_node_info = IndexerNodeInfo { - node_id: NodeId::new("test-indexer".to_string()), - generation_id: 0, - client, - indexing_tasks: Vec::new(), - indexing_capacity: CpuCapacity::from_cpu_millis(4_000), - }; - indexer_pool.insert(indexer_node_info.node_id.clone(), indexer_node_info); + let _indexing_inbox = add_test_indexer_with_mailbox( + &universe, + &indexer_pool, + NodeId::new("test-indexer".to_string()), + ); let ingester_pool = IngesterPool::default(); let mut mock_metastore = MockMetastoreService::new(); + setup_disabled_maintenance(&mut mock_metastore); let mut index_0 = IndexMetadata::for_test("test-index-0", "ram:///test-index-0"); let mut source = SourceConfig::ingest_v2(); @@ -2142,6 +2942,7 @@ mod tests { let index_0_clone = index_0.clone(); let mut mock_metastore = MockMetastoreService::new(); + setup_disabled_maintenance(&mut mock_metastore); mock_metastore .expect_list_indexes_metadata() .times(1) @@ -2263,6 +3064,7 @@ mod tests { let index_uid_clone = index_0.index_uid.clone(); let mut mock_metastore = MockMetastoreService::new(); + setup_disabled_maintenance(&mut 
mock_metastore); mock_metastore.expect_delete_source().return_once( move |delete_source_request: DeleteSourceRequest| { assert_eq!(delete_source_request.index_uid(), &index_uid_clone); @@ -2346,6 +3148,7 @@ mod tests { let ingester_pool = IngesterPool::default(); let mut mock_metastore = MockMetastoreService::new(); + setup_disabled_maintenance(&mut mock_metastore); mock_metastore .expect_list_indexes_metadata() @@ -2476,10 +3279,27 @@ mod tests { let indexer_pool = IndexerPool::default(); let ingester_pool = IngesterPool::default(); let mut mock_metastore = MockMetastoreService::new(); + setup_disabled_maintenance(&mut mock_metastore); mock_metastore .expect_list_indexes_metadata() .return_once(|_| Ok(ListIndexesMetadataResponse::for_test(Vec::new()))); let metastore = MetastoreServiceClient::from_mock(mock_metastore); + + // Create mock maintenance persistence metastore + let mut mock_persistence_metastore = MockMetastoreService::new(); + mock_persistence_metastore + .expect_get_kv() + .returning(|_| Ok(GetKvResponse { value: None })); + mock_persistence_metastore + .expect_set_kv() + .returning(|_| Ok(EmptyResponse {})); + mock_persistence_metastore + .expect_delete_kv() + .returning(|_| Ok(EmptyResponse {})); + let maintenance_persistence = MetastoreKvPersistence::new( + MetastoreServiceClient::from_mock(mock_persistence_metastore), + ); + let disable_control_loop = true; let (_control_plane_mailbox, control_plane_handle, _readiness_rx) = ControlPlane::spawn_inner( @@ -2491,6 +3311,7 @@ mod tests { ingester_pool, metastore, disable_control_loop, + maintenance_persistence, ); let cluster_change_stream_tx = cluster_change_stream_factory.change_stream_tx(); let indexer_node = @@ -2556,6 +3377,7 @@ mod tests { ingester_pool.insert(ingester_id, ingester); let mut mock_metastore = MockMetastoreService::new(); + setup_disabled_maintenance(&mut mock_metastore); mock_metastore .expect_list_indexes_metadata() .return_once(|_| 
Ok(ListIndexesMetadataResponse::for_test(Vec::new()))); @@ -2664,6 +3486,248 @@ mod tests { universe.assert_quit().await; } + #[tokio::test] + async fn test_control_plane_swap_pipelines_applied_on_next_control_loop() { + let universe = Universe::default(); + let node_id = NodeId::from("test-control-plane"); + let cluster_change_stream_factory = ClusterChangeStreamFactoryForTest::default(); + let indexer_pool = IndexerPool::default(); + let ingester_pool = IngesterPool::default(); + + // Two mock indexers that accept unlimited apply_indexing_plan calls. + let mut mock_indexer_1 = MockIndexingService::new(); + mock_indexer_1 + .expect_apply_indexing_plan() + .returning(|_| Ok(ApplyIndexingPlanResponse {})); + let mut mock_indexer_2 = MockIndexingService::new(); + mock_indexer_2 + .expect_apply_indexing_plan() + .returning(|_| Ok(ApplyIndexingPlanResponse {})); + + indexer_pool.insert( + NodeId::from("indexer-1"), + IndexerNodeInfo { + node_id: NodeId::from("indexer-1"), + generation_id: 0, + client: IndexingServiceClient::from_mock(mock_indexer_1), + indexing_tasks: Vec::new(), + indexing_capacity: CpuCapacity::from_cpu_millis(4_000), + }, + ); + indexer_pool.insert( + NodeId::from("indexer-2"), + IndexerNodeInfo { + node_id: NodeId::from("indexer-2"), + generation_id: 0, + client: IndexingServiceClient::from_mock(mock_indexer_2), + indexing_tasks: Vec::new(), + indexing_capacity: CpuCapacity::from_cpu_millis(4_000), + }, + ); + + // Two indexes, each with a single-pipeline Kafka source and an ingest-v2 source + // (so that `create_or_enable_ingest_v2_sources_if_necessary` does not call `add_source`). 
+ let mut index_a = IndexMetadata::for_test("index-a", "ram:///index-a"); + index_a + .add_source(SourceConfig::for_test( + "kafka-source", + SourceParams::Kafka(KafkaSourceParams { + topic: "topic-a".to_string(), + client_log_level: None, + enable_backfill_mode: false, + client_params: json!({}), + }), + )) + .unwrap(); + index_a.add_source(SourceConfig::ingest_v2()).unwrap(); + + let mut index_b = IndexMetadata::for_test("index-b", "ram:///index-b"); + index_b + .add_source(SourceConfig::for_test( + "kafka-source", + SourceParams::Kafka(KafkaSourceParams { + topic: "topic-b".to_string(), + client_log_level: None, + enable_backfill_mode: false, + client_params: json!({}), + }), + )) + .unwrap(); + index_b.add_source(SourceConfig::ingest_v2()).unwrap(); + + let mut mock_metastore = MockMetastoreService::new(); + setup_disabled_maintenance(&mut mock_metastore); + mock_metastore + .expect_list_indexes_metadata() + .return_once(move |_| { + Ok(ListIndexesMetadataResponse::for_test(vec![ + index_a, index_b, + ])) + }); + mock_metastore + .expect_list_shards() + .return_once(|_| Ok(ListShardsResponse::default())); + + // Create mock maintenance persistence metastore + let mut mock_persistence_metastore = MockMetastoreService::new(); + mock_persistence_metastore + .expect_get_kv() + .returning(|_| Ok(GetKvResponse { value: None })); + mock_persistence_metastore + .expect_set_kv() + .returning(|_| Ok(EmptyResponse {})); + mock_persistence_metastore + .expect_delete_kv() + .returning(|_| Ok(EmptyResponse {})); + let maintenance_persistence = MetastoreKvPersistence::new( + MetastoreServiceClient::from_mock(mock_persistence_metastore), + ); + + let cluster_config = ClusterConfig::for_test(); + let (control_plane_mailbox, _control_plane_handle, _readiness_rx) = + ControlPlane::spawn_inner( + &universe, + cluster_config, + node_id, + cluster_change_stream_factory, + indexer_pool, + ingester_pool, + MetastoreServiceClient::from_mock(mock_metastore), + false, // keep the control 
loop enabled + maintenance_persistence, + ); + + // ── Wait for the initial plan to be built ────────────────────────── + // Use `mailbox.ask(Observe)` to get state directly from the inner + // actor (the supervisor handle only returns a cached snapshot that may + // lag behind). + let initial_state = { + let mut state = None; + for _ in 0..100 { + tokio::time::sleep(Duration::from_millis(50)).await; + let obs: ControlPlaneObservableState = + control_plane_mailbox.ask(Observe).await.unwrap(); + if obs + .indexing_scheduler + .current_targeted_physical_plan + .is_some() + { + state = Some(obs); + break; + } + } + state.expect("initial plan should have been built") + }; + + let initial_plan = initial_state + .indexing_scheduler + .current_targeted_physical_plan + .as_ref() + .unwrap(); + + // Each indexer should have exactly 1 task (4000 mcpu capacity, 3200 mcpu per pipeline). + let i1_tasks = initial_plan.indexer("indexer-1").unwrap(); + let i2_tasks = initial_plan.indexer("indexer-2").unwrap(); + assert_eq!(i1_tasks.len(), 1); + assert_eq!(i2_tasks.len(), 1); + + let idx_on_1 = i1_tasks[0].index_uid().index_id.clone(); + let idx_on_2 = i2_tasks[0].index_uid().index_id.clone(); + assert_ne!(idx_on_1, idx_on_2); + + let num_schedule_before = initial_state.indexing_scheduler.num_schedule_indexing_plan; + + // ── Swap pipelines ───────────────────────────────────────────────── + let response: SwapIndexingPipelinesResponse = control_plane_mailbox + .ask(SwapIndexingPipelinesRequest { + swaps: vec![SwapIndexingPipelinesEntry { + left_node_id: "indexer-1".to_string(), + left_index_id: idx_on_1.clone(), + right_node_id: "indexer-2".to_string(), + right_index_id: Some(idx_on_2.clone()), + }], + }) + .await + .unwrap() + .unwrap(); + assert!(response.results[0].success, "swap must succeed"); + + // Immediately after the swap, the targeted plan should reflect it. 
+ let after_swap: ControlPlaneObservableState = + control_plane_mailbox.ask(Observe).await.unwrap(); + let plan = after_swap + .indexing_scheduler + .current_targeted_physical_plan + .as_ref() + .unwrap(); + assert_eq!( + plan.indexer("indexer-1").unwrap()[0].index_uid().index_id, + idx_on_2, + "indexer-1 should now have the index that was on indexer-2" + ); + assert_eq!( + plan.indexer("indexer-2").unwrap()[0].index_uid().index_id, + idx_on_1, + "indexer-2 should now have the index that was on indexer-1" + ); + + let num_applied_after_swap = after_swap + .indexing_scheduler + .num_applied_physical_indexing_plan; + + // ── Wait for the control loop to re-apply the (swapped) plan ─────── + // `control_running_plan` has a MIN_DURATION_BETWEEN_SCHEDULING cooldown + // (50 ms in tests). The control loop interval is 100 ms. We poll until + // the apply counter increases. + let mut reapplied = false; + for _ in 0..40 { + tokio::time::sleep(Duration::from_millis(100)).await; + let obs: ControlPlaneObservableState = + control_plane_mailbox.ask(Observe).await.unwrap(); + if obs.indexing_scheduler.num_applied_physical_indexing_plan > num_applied_after_swap { + reapplied = true; + break; + } + } + assert!( + reapplied, + "the control loop should have re-applied the plan" + ); + + // ── Verify the swapped plan is still in place after re-apply ─────── + let final_state: ControlPlaneObservableState = + control_plane_mailbox.ask(Observe).await.unwrap(); + let final_plan = final_state + .indexing_scheduler + .current_targeted_physical_plan + .as_ref() + .unwrap(); + + assert_eq!( + final_plan.indexer("indexer-1").unwrap()[0] + .index_uid() + .index_id, + idx_on_2, + "after control loop re-apply, indexer-1 should still have the swapped index" + ); + assert_eq!( + final_plan.indexer("indexer-2").unwrap()[0] + .index_uid() + .index_id, + idx_on_1, + "after control loop re-apply, indexer-2 should still have the swapped index" + ); + + // No rebuild should have happened; only 
re-applies of the existing plan. + assert_eq!( + final_state.indexing_scheduler.num_schedule_indexing_plan, num_schedule_before, + "no rebuild should have happened after the swap – the control loop should only \ + re-apply the existing plan" + ); + + universe.assert_quit().await; + } + #[tokio::test] async fn test_control_plane_get_debug_info() { let universe = Universe::with_accelerated_time(); @@ -2710,6 +3774,7 @@ mod tests { ingester_pool.insert(ingester_id, ingester); let mut mock_metastore = MockMetastoreService::new(); + setup_disabled_maintenance(&mut mock_metastore); mock_metastore .expect_list_indexes_metadata() .return_once(|_| Ok(ListIndexesMetadataResponse::for_test(Vec::new()))); diff --git a/quickwit/quickwit-control-plane/src/indexing_plan.rs b/quickwit/quickwit-control-plane/src/indexing_plan.rs index befeef18232..31cce996dfa 100644 --- a/quickwit/quickwit-control-plane/src/indexing_plan.rs +++ b/quickwit/quickwit-control-plane/src/indexing_plan.rs @@ -14,13 +14,13 @@ use fnv::FnvHashMap; use quickwit_proto::indexing::IndexingTask; -use serde::Serialize; +use serde::{Deserialize, Serialize}; /// A [`PhysicalIndexingPlan`] defines the list of indexing tasks /// each indexer, identified by its node ID, should run. /// TODO(fmassot): a metastore version number will be attached to the plan /// to identify if the plan is up to date with the metastore. 
-#[derive(Debug, PartialEq, Clone, Serialize)] +#[derive(Debug, PartialEq, Clone, Serialize, Deserialize)] pub struct PhysicalIndexingPlan { indexing_tasks_per_indexer_id: FnvHashMap>, } diff --git a/quickwit/quickwit-control-plane/src/indexing_scheduler/mod.rs b/quickwit/quickwit-control-plane/src/indexing_scheduler/mod.rs index 7feab6564e7..9d1a1b9dee3 100644 --- a/quickwit/quickwit-control-plane/src/indexing_scheduler/mod.rs +++ b/quickwit/quickwit-control-plane/src/indexing_scheduler/mod.rs @@ -25,12 +25,18 @@ use fnv::{FnvHashMap, FnvHashSet}; use itertools::Itertools; use once_cell::sync::OnceCell; use quickwit_common::pretty::PrettySample; -use quickwit_config::{FileSourceParams, SourceParams, indexing_pipeline_params_fingerprint}; +use quickwit_config::{ + FileSourceParams, SourceParams, disable_ingest_v1, indexing_pipeline_params_fingerprint, +}; +use quickwit_proto::control_plane::{ + ControlPlaneResult, SwapIndexingPipelinesEntry, SwapIndexingPipelinesRequest, + SwapIndexingPipelinesResponse, SwapIndexingPipelinesResult, +}; use quickwit_proto::indexing::{ ApplyIndexingPlanRequest, CpuCapacity, IndexingService, IndexingTask, PIPELINE_FULL_CAPACITY, PIPELINE_THROUGHPUT, }; -use quickwit_proto::types::NodeId; +use quickwit_proto::types::{NodeId, PipelineUid}; use scheduling::{SourceToSchedule, SourceToScheduleType}; use serde::Serialize; use tracing::{debug, info, warn}; @@ -58,7 +64,7 @@ const MAX_LOAD_PER_PIPELINE: CpuCapacity = CpuCapacity::from_cpu_millis(3_200); pub struct IndexingSchedulerState { pub num_applied_physical_indexing_plan: usize, pub num_schedule_indexing_plan: usize, - pub last_applied_physical_plan: Option, + pub current_targeted_physical_plan: Option, #[serde(skip)] pub last_applied_plan_timestamp: Option, } @@ -218,7 +224,11 @@ fn get_sources_to_schedule(model: &ControlPlaneModel) -> Vec { } SourceParams::IngestApi => { - // TODO ingest v1 is scheduled differently + if disable_ingest_v1() { + // Existing indexes might still have the 
_ingest-api-source + continue; + } + // Note: ingest v1 is scheduled differently sources.push(SourceToSchedule { source_uid, source_type: SourceToScheduleType::IngestV1, @@ -276,6 +286,17 @@ fn get_sources_to_schedule(model: &ControlPlaneModel) -> Vec { sources } +/// Holds the pre-validated tasks to move for a single swap entry. +/// Tasks are collected from the original plan before any modifications. +struct ValidSwapOperation { + left_node_id: String, + left_tasks: Vec, + right_node_id: String, + right_tasks: Vec, + left_index_id: String, + right_index_id: Option, +} + impl IndexingScheduler { pub fn new(cluster_id: String, self_node_id: NodeId, indexer_pool: IndexerPool) -> Self { IndexingScheduler { @@ -291,12 +312,30 @@ impl IndexingScheduler { self.state.clone() } - // Should be called whenever a change in the list of index/shard - // has happened. - // - // Prefer not calling this method directly, and instead call - // `ControlPlane::rebuild_indexing_plan_debounced`. - pub(crate) fn rebuild_plan(&mut self, model: &ControlPlaneModel) { + /// Loads a frozen indexing plan without applying it + /// to indexers or triggering any scheduling logic. + /// + /// This is used during control plane initialization when maintenance mode is active: + /// the frozen plan is restored as the `current_targeted_physical_plan` so that the + /// `control_running_plan` loop can re-apply it to indexers that restart during the + /// maintenance window. + pub(crate) fn load_frozen_plan(&mut self, plan: crate::indexing_plan::PhysicalIndexingPlan) { + self.state.current_targeted_physical_plan = Some(plan); + } + + /// Should be called whenever a change in the list of index/shard has + /// happened. + /// + /// When in maintenance mode (`is_maintenance` is true), this function exits + /// early to keep the indexing plan frozen. This design provides a simple + /// safeguard to prevent unintended plan modifications during maintenance. 
+ /// + /// Prefer not calling this method directly, and instead call + /// `ControlPlane::rebuild_indexing_plan_debounced`. + pub(crate) fn rebuild_plan(&mut self, model: &ControlPlaneModel, is_maintenance: bool) { + if is_maintenance { + return; + } crate::metrics::CONTROL_PLANE_METRICS.schedule_total.inc(); let notify_on_drop = self.next_rebuild_tracker.start_rebuild(); @@ -327,15 +366,15 @@ impl IndexingScheduler { let new_physical_plan = build_physical_indexing_plan( &sources, &indexer_id_to_cpu_capacities, - self.state.last_applied_physical_plan.as_ref(), + self.state.current_targeted_physical_plan.as_ref(), &shard_locations, ); let shard_locality_metrics = get_shard_locality_metrics(&new_physical_plan, &shard_locations); crate::metrics::CONTROL_PLANE_METRICS.set_shard_locality_metrics(shard_locality_metrics); - if let Some(last_applied_plan) = &self.state.last_applied_physical_plan { + if let Some(current_targeted_plan) = &self.state.current_targeted_physical_plan { let plans_diff = get_indexing_plans_diff( - last_applied_plan.indexing_tasks_per_indexer(), + current_targeted_plan.indexing_tasks_per_indexer(), new_physical_plan.indexing_tasks_per_indexer(), ); // No need to apply the new plan as it is the same as the old one. @@ -351,15 +390,15 @@ impl IndexingScheduler { /// chitchat cluster state. If true, do nothing. /// - If node IDs differ, schedule a new indexing plan. /// - If indexing tasks differ, apply again the last plan. - pub(crate) fn control_running_plan(&mut self, model: &ControlPlaneModel) { - let last_applied_plan = - if let Some(last_applied_plan) = &self.state.last_applied_physical_plan { - last_applied_plan + pub(crate) fn control_running_plan(&mut self, model: &ControlPlaneModel, is_maintenance: bool) { + let current_targeted_plan = + if let Some(current_targeted) = &self.state.current_targeted_physical_plan { + current_targeted } else { // If there is no plan, the node is probably starting and the scheduler did not find // indexers yet. 
In this case, we want to schedule as soon as possible to find new // indexers. - self.rebuild_plan(model); + self.rebuild_plan(model, is_maintenance); return; }; if let Some(last_applied_plan_timestamp) = self.state.last_applied_plan_timestamp @@ -376,15 +415,15 @@ impl IndexingScheduler { let indexing_plans_diff = get_indexing_plans_diff( &running_indexing_tasks_by_node_id, - last_applied_plan.indexing_tasks_per_indexer(), + current_targeted_plan.indexing_tasks_per_indexer(), ); if !indexing_plans_diff.has_same_nodes() { info!(plans_diff=?indexing_plans_diff, "running plan and last applied plan node IDs differ: schedule an indexing plan"); - self.rebuild_plan(model); + self.rebuild_plan(model, is_maintenance); } else if !indexing_plans_diff.has_same_tasks() { // Some nodes may have not received their tasks, apply it again. info!(plans_diff=?indexing_plans_diff, "running tasks and last applied tasks differ: reapply last plan"); - self.apply_physical_indexing_plan(&indexers, last_applied_plan.clone(), None); + self.apply_physical_indexing_plan(&indexers, current_targeted_plan.clone(), None); } } @@ -432,7 +471,262 @@ impl IndexingScheduler { } self.state.num_applied_physical_indexing_plan += 1; self.state.last_applied_plan_timestamp = Some(Instant::now()); - self.state.last_applied_physical_plan = Some(new_physical_plan); + self.state.current_targeted_physical_plan = Some(new_physical_plan); + } + + /// Swaps indexing pipelines between indexers as requested. + /// + /// The swap is applied in 3 phases: + /// 1. Upfront contradiction check (rejects entire request on failure). + /// 2. Per-swap validation against the original (unmodified) plan. + /// 3. Atomic application of all valid swaps to a working copy of the plan. + pub(crate) fn swap_pipelines( + &mut self, + request: SwapIndexingPipelinesRequest, + ) -> ControlPlaneResult { + // Phase 0: Check that a plan exists. 
+ let Some(original_plan) = &mut self.state.current_targeted_physical_plan else { + return Ok(SwapIndexingPipelinesResponse { + results: request + .swaps + .into_iter() + .map(|swap| SwapIndexingPipelinesResult { + swap: Some(swap.clone()), + success: false, + reason: "no indexing plan is currently applied".to_string(), + }) + .collect(), + }); + }; + + // Phase 1: Upfront contradiction check (rejects entire request on failure). + if let Err(error_response) = Self::check_swap_contradictions(&request) { + return Ok(error_response); + } + + // Phase 2: Validate each swap against the ORIGINAL plan and collect + // the tasks to move. + let mut swap_results: Vec = + Vec::with_capacity(request.swaps.len()); + let mut valid_operations: Vec = Vec::new(); + + for swap in &request.swaps { + match Self::validate_single_swap(original_plan, swap) { + Ok(operation) => { + valid_operations.push(operation); + swap_results.push(SwapIndexingPipelinesResult { + swap: Some(swap.clone()), + success: true, + reason: String::new(), + }); + } + Err(reason) => { + swap_results.push(SwapIndexingPipelinesResult { + swap: Some(swap.clone()), + success: false, + reason, + }); + } + } + } + + // Phase 3: Apply all valid swaps atomically to a working copy. + if !valid_operations.is_empty() { + for operation in &valid_operations { + Self::apply_swap_operation(original_plan, operation); + } + } + + Ok(SwapIndexingPipelinesResponse { + results: swap_results, + }) + } + + /// Validates the entire swap request for contradictions. + /// + /// A contradiction exists when: + /// - A swap entry references the same node on both sides. + /// - The same (node_id, index_id) pair appears in more than one swap entry. + /// + /// On contradiction, returns a response with all swaps marked as failed. 
+ fn check_swap_contradictions( + request: &SwapIndexingPipelinesRequest, + ) -> Result<(), SwapIndexingPipelinesResponse> { + let mut seen_slots: FnvHashSet<(&str, &str)> = FnvHashSet::default(); + + let make_error_response = |reason: String| SwapIndexingPipelinesResponse { + results: request + .swaps + .iter() + .map(|s| SwapIndexingPipelinesResult { + swap: Some(s.clone()), + success: false, + reason: reason.clone(), + }) + .collect(), + }; + + for swap in &request.swaps { + // Reject same-node operations. + if swap.left_node_id == swap.right_node_id { + let right_index_desc = swap.right_index_id.as_deref().unwrap_or(""); + let reason = format!( + "request rejected: swap between '{}' (index '{}') and '{}' (index '{}') \ + references the same node", + swap.left_node_id, swap.left_index_id, swap.right_node_id, right_index_desc, + ); + return Err(make_error_response(reason)); + } + + let left_slot = (swap.left_node_id.as_str(), swap.left_index_id.as_str()); + + // Check for duplicate left slots across entries. + if !seen_slots.insert(left_slot) { + let reason = format!( + "request rejected: contradictory swaps — index '{}' on node '{}' is \ + referenced by multiple swap entries", + left_slot.1, left_slot.0, + ); + return Err(make_error_response(reason)); + } + + // Only check right slot for full swaps (when right_index_id is specified). + if let Some(right_index_id) = &swap.right_index_id { + let right_slot = (swap.right_node_id.as_str(), right_index_id.as_str()); + if !seen_slots.insert(right_slot) { + let reason = format!( + "request rejected: contradictory swaps — index '{}' on node '{}' is \ + referenced by multiple swap entries", + right_slot.1, right_slot.0, + ); + return Err(make_error_response(reason)); + } + } + } + + Ok(()) + } + + /// Validates a single swap entry against the original (unmodified) plan. 
+ /// + /// When `right_index_id` is `None`, the operation is a one-way move: the left + /// index's pipelines are moved to the right node without moving any pipelines back. + fn validate_single_swap( + plan: &PhysicalIndexingPlan, + swap: &SwapIndexingPipelinesEntry, + ) -> Result { + // 1. Verify the left indexer exists in the plan. + let left_tasks = plan.indexer(&swap.left_node_id).ok_or_else(|| { + format!( + "indexer '{}' not found in the current plan", + swap.left_node_id + ) + })?; + + // 2. Collect tasks for the left index. + let left_tasks_to_move: Vec = left_tasks + .iter() + .filter(|t| t.index_uid().index_id == swap.left_index_id) + .cloned() + .collect(); + + // 3. Reject if no tasks found on the left side. + if left_tasks_to_move.is_empty() { + return Err(format!( + "no pipelines found for index '{}' on indexer '{}'", + swap.left_index_id, swap.left_node_id, + )); + } + + // 4. For full swaps, validate the right side too. For move-only operations (right_index_id + // is None), just verify the right indexer exists. 
+ let right_tasks_to_move = if let Some(right_index_id) = &swap.right_index_id { + let right_tasks = plan.indexer(&swap.right_node_id).ok_or_else(|| { + format!( + "indexer '{}' not found in the current plan", + swap.right_node_id + ) + })?; + + let right_tasks_to_move: Vec = right_tasks + .iter() + .filter(|t| t.index_uid().index_id == *right_index_id) + .cloned() + .collect(); + + if right_tasks_to_move.is_empty() { + return Err(format!( + "no pipelines found for index '{}' on indexer '{}'", + right_index_id, swap.right_node_id, + )); + } + + if left_tasks_to_move.len() != right_tasks_to_move.len() { + return Err(format!( + "pipeline count mismatch: '{}' has {} pipeline(s) on '{}', but '{}' has {} \ + pipeline(s) on '{}'", + swap.left_index_id, + left_tasks_to_move.len(), + swap.left_node_id, + right_index_id, + right_tasks_to_move.len(), + swap.right_node_id, + )); + } + + right_tasks_to_move + } else { + // Move-only: verify the right indexer exists in the plan. + plan.indexer(&swap.right_node_id).ok_or_else(|| { + format!( + "indexer '{}' not found in the current plan", + swap.right_node_id + ) + })?; + Vec::new() + }; + + Ok(ValidSwapOperation { + left_node_id: swap.left_node_id.clone(), + left_tasks: left_tasks_to_move, + right_node_id: swap.right_node_id.clone(), + right_tasks: right_tasks_to_move, + left_index_id: swap.left_index_id.clone(), + right_index_id: swap.right_index_id.clone(), + }) + } + + /// Applies a validated swap operation to a working copy of the plan. + /// + /// When `right_index_id` is `None`, this is a one-way move: the left index's + /// pipelines are moved to the right node without any pipelines moving back. + fn apply_swap_operation(plan: &mut PhysicalIndexingPlan, operation: &ValidSwapOperation) { + let plan_map = plan.indexing_tasks_per_indexer_mut(); + + // Remove the left index's tasks from the left node. 
+ if let Some(left_node_tasks) = plan_map.get_mut(&operation.left_node_id) { + left_node_tasks.retain(|t| t.index_uid().index_id != operation.left_index_id); + } + // For full swaps, also remove the right index's tasks from the right node. + if let (Some(right_index_id), Some(right_node_tasks)) = ( + &operation.right_index_id, + plan_map.get_mut(&operation.right_node_id), + ) { + right_node_tasks.retain(|t| t.index_uid().index_id != *right_index_id); + } + + // Move left tasks to the right node with fresh pipeline UIDs. + for task in &operation.left_tasks { + let mut moved_task = task.clone(); + moved_task.pipeline_uid = Some(PipelineUid::random()); + plan.add_indexing_task(&operation.right_node_id, moved_task); + } + // For full swaps, move right tasks to the left node with fresh pipeline UIDs. + for task in &operation.right_tasks { + let mut moved_task = task.clone(); + moved_task.pipeline_uid = Some(PipelineUid::random()); + plan.add_indexing_task(&operation.left_node_id, moved_task); + } } } @@ -543,7 +837,10 @@ fn format_indexing_task_map( const MAX_INDEXES: usize = 10; let mut index_displayed = 0; write!(formatter, "{{")?; - let mut indexer_iter = indexing_tasks.iter().enumerate(); + let mut indexer_iter = indexing_tasks + .iter() + .filter(|(_, tasks)| !tasks.is_empty()) + .enumerate(); for (i, (index_name, tasks)) in &mut indexer_iter { if i != 0 { write!(formatter, ", ")?; @@ -610,14 +907,14 @@ fn format_indexing_task_map( /// the last plan applied by the scheduler. fn get_indexing_plans_diff<'a>( running_plan: &'a FnvHashMap>, - last_applied_plan: &'a FnvHashMap>, + current_targeted_plan: &'a FnvHashMap>, ) -> IndexingPlansDiff<'a> { // Nodes diff. 
let running_node_ids: FnvHashSet<&str> = running_plan .keys() .map(|node_id| node_id.as_str()) .collect(); - let planned_node_ids: FnvHashSet<&str> = last_applied_plan + let planned_node_ids: FnvHashSet<&str> = current_targeted_plan .keys() .map(|node_id| node_id.as_str()) .collect(); @@ -638,7 +935,7 @@ fn get_indexing_plans_diff<'a>( .get(*node_id) .map(Vec::as_slice) .unwrap_or_else(|| &[]); - let last_applied_tasks = last_applied_plan + let last_applied_tasks = current_targeted_plan .get(*node_id) .map(Vec::as_slice) .unwrap_or_else(|| &[]); @@ -705,6 +1002,10 @@ mod tests { use proptest::{prop_compose, proptest}; use quickwit_config::{IndexConfig, KafkaSourceParams, SourceConfig, SourceParams}; use quickwit_metastore::IndexMetadata; + use quickwit_proto::control_plane::{SwapIndexingPipelinesEntry, SwapIndexingPipelinesRequest}; + use quickwit_proto::indexing::{ + ApplyIndexingPlanResponse, IndexingServiceClient, MockIndexingService, + }; use quickwit_proto::types::{IndexUid, PipelineUid, ShardId, SourceUid}; use super::*; @@ -871,6 +1172,620 @@ mod tests { } } + fn make_test_task(index_id: &str, source_id: &str, pipeline_uid: u128) -> IndexingTask { + IndexingTask { + index_uid: Some(IndexUid::for_test(index_id, 0)), + source_id: source_id.to_string(), + pipeline_uid: Some(PipelineUid::for_test(pipeline_uid)), + shard_ids: Vec::new(), + params_fingerprint: 0, + } + } + + fn make_swap_entry( + left_node: &str, + left_index: &str, + right_node: &str, + right_index: &str, + ) -> SwapIndexingPipelinesEntry { + SwapIndexingPipelinesEntry { + left_node_id: left_node.to_string(), + left_index_id: left_index.to_string(), + right_node_id: right_node.to_string(), + right_index_id: Some(right_index.to_string()), + } + } + + fn make_move_entry( + left_node: &str, + left_index: &str, + right_node: &str, + ) -> SwapIndexingPipelinesEntry { + SwapIndexingPipelinesEntry { + left_node_id: left_node.to_string(), + left_index_id: left_index.to_string(), + right_node_id: 
right_node.to_string(), + right_index_id: None, + } + } + + fn build_test_scheduler_with_plan(plan: PhysicalIndexingPlan) -> IndexingScheduler { + let indexer_pool = IndexerPool::default(); + for node_id in plan.indexing_tasks_per_indexer().keys() { + let mut mock_indexer = MockIndexingService::new(); + mock_indexer + .expect_apply_indexing_plan() + .returning(|_| Ok(ApplyIndexingPlanResponse {})); + let indexer_info = IndexerNodeInfo { + node_id: NodeId::from(node_id.as_str()), + generation_id: 0, + client: IndexingServiceClient::from_mock(mock_indexer), + indexing_tasks: Vec::new(), + indexing_capacity: CpuCapacity::from_cpu_millis(4_000), + }; + indexer_pool.insert(indexer_info.node_id.clone(), indexer_info); + } + let mut scheduler = + IndexingScheduler::new("test-cluster".to_string(), "test-node".into(), indexer_pool); + scheduler.state.current_targeted_physical_plan = Some(plan); + scheduler + } + + #[tokio::test] + async fn test_swap_pipelines_basic() { + let mut plan = PhysicalIndexingPlan::with_indexer_ids(&[ + "indexer-1".to_string(), + "indexer-2".to_string(), + ]); + let task_a = make_test_task("index-a", "source-1", 1); + let task_b = make_test_task("index-b", "source-1", 2); + plan.add_indexing_task("indexer-1", task_a.clone()); + plan.add_indexing_task("indexer-2", task_b.clone()); + + let mut scheduler = build_test_scheduler_with_plan(plan); + + let request = SwapIndexingPipelinesRequest { + swaps: vec![make_swap_entry( + "indexer-1", + "index-a", + "indexer-2", + "index-b", + )], + }; + let response = scheduler.swap_pipelines(request).unwrap(); + + assert_eq!(response.results.len(), 1); + assert!(response.results[0].success); + + let new_plan = scheduler + .state + .current_targeted_physical_plan + .as_ref() + .unwrap(); + // index-a should now be on indexer-2 + let indexer_2_tasks = new_plan.indexer("indexer-2").unwrap(); + assert_eq!(indexer_2_tasks.len(), 1); + assert_eq!(indexer_2_tasks[0].index_uid().index_id, "index-a"); + // index-b should 
now be on indexer-1 + let indexer_1_tasks = new_plan.indexer("indexer-1").unwrap(); + assert_eq!(indexer_1_tasks.len(), 1); + assert_eq!(indexer_1_tasks[0].index_uid().index_id, "index-b"); + } + + #[tokio::test] + async fn test_swap_pipelines_count_mismatch() { + let mut plan = PhysicalIndexingPlan::with_indexer_ids(&[ + "indexer-1".to_string(), + "indexer-2".to_string(), + ]); + plan.add_indexing_task("indexer-1", make_test_task("index-a", "source-1", 1)); + plan.add_indexing_task("indexer-1", make_test_task("index-a", "source-1", 2)); + plan.add_indexing_task("indexer-2", make_test_task("index-b", "source-1", 3)); + + let mut scheduler = build_test_scheduler_with_plan(plan.clone()); + + let request = SwapIndexingPipelinesRequest { + swaps: vec![make_swap_entry( + "indexer-1", + "index-a", + "indexer-2", + "index-b", + )], + }; + let response = scheduler.swap_pipelines(request).unwrap(); + + assert_eq!(response.results.len(), 1); + assert!(!response.results[0].success); + assert!( + response.results[0] + .reason + .contains("pipeline count mismatch") + ); + + // Plan should be unchanged. 
+ assert_eq!( + scheduler + .state + .current_targeted_physical_plan + .as_ref() + .unwrap(), + &plan, + ); + } + + #[tokio::test] + async fn test_swap_pipelines_unknown_indexer() { + let mut plan = PhysicalIndexingPlan::with_indexer_ids(&["indexer-1".to_string()]); + plan.add_indexing_task("indexer-1", make_test_task("index-a", "source-1", 1)); + + let mut scheduler = build_test_scheduler_with_plan(plan); + + let request = SwapIndexingPipelinesRequest { + swaps: vec![make_swap_entry( + "indexer-1", + "index-a", + "indexer-999", + "index-b", + )], + }; + let response = scheduler.swap_pipelines(request).unwrap(); + + assert!(!response.results[0].success); + assert!(response.results[0].reason.contains("not found")); + } + + #[tokio::test] + async fn test_swap_pipelines_no_pipelines_for_index() { + let mut plan = PhysicalIndexingPlan::with_indexer_ids(&[ + "indexer-1".to_string(), + "indexer-2".to_string(), + ]); + plan.add_indexing_task("indexer-1", make_test_task("index-a", "source-1", 1)); + plan.add_indexing_task("indexer-2", make_test_task("index-b", "source-1", 2)); + + let mut scheduler = build_test_scheduler_with_plan(plan); + + let request = SwapIndexingPipelinesRequest { + swaps: vec![make_swap_entry( + "indexer-1", + "index-NONEXISTENT", + "indexer-2", + "index-b", + )], + }; + let response = scheduler.swap_pipelines(request).unwrap(); + + assert!(!response.results[0].success); + assert!(response.results[0].reason.contains("no pipelines found")); + } + + #[tokio::test] + async fn test_swap_pipelines_multiple_swaps_partial_success() { + let mut plan = PhysicalIndexingPlan::with_indexer_ids(&[ + "indexer-1".to_string(), + "indexer-2".to_string(), + "indexer-3".to_string(), + ]); + // Valid swap pair: 1 pipeline each. + plan.add_indexing_task("indexer-1", make_test_task("index-a", "source-1", 1)); + plan.add_indexing_task("indexer-2", make_test_task("index-b", "source-1", 2)); + // Invalid swap pair: count mismatch (2 vs 1). 
+ plan.add_indexing_task("indexer-2", make_test_task("index-c", "source-1", 3)); + plan.add_indexing_task("indexer-2", make_test_task("index-c", "source-1", 4)); + plan.add_indexing_task("indexer-3", make_test_task("index-d", "source-1", 5)); + + let mut scheduler = build_test_scheduler_with_plan(plan); + + let request = SwapIndexingPipelinesRequest { + swaps: vec![ + make_swap_entry("indexer-1", "index-a", "indexer-2", "index-b"), + make_swap_entry("indexer-2", "index-c", "indexer-3", "index-d"), + ], + }; + let response = scheduler.swap_pipelines(request).unwrap(); + + assert_eq!(response.results.len(), 2); + assert!(response.results[0].success); + assert!(!response.results[1].success); + assert!( + response.results[1] + .reason + .contains("pipeline count mismatch") + ); + + // The first swap should have been applied. + let new_plan = scheduler + .state + .current_targeted_physical_plan + .as_ref() + .unwrap(); + let indexer_1_tasks = new_plan.indexer("indexer-1").unwrap(); + assert_eq!(indexer_1_tasks.len(), 1); + assert_eq!(indexer_1_tasks[0].index_uid().index_id, "index-b"); + } + + #[tokio::test] + async fn test_swap_pipelines_no_plan() { + let indexer_pool = IndexerPool::default(); + let mut scheduler = + IndexingScheduler::new("test-cluster".to_string(), "test-node".into(), indexer_pool); + + let request = SwapIndexingPipelinesRequest { + swaps: vec![make_swap_entry( + "indexer-1", + "index-a", + "indexer-2", + "index-b", + )], + }; + let response = scheduler.swap_pipelines(request).unwrap(); + + assert!(!response.results[0].success); + assert!(response.results[0].reason.contains("no indexing plan")); + } + + #[tokio::test] + async fn test_swap_pipelines_same_node_rejected() { + let mut plan = PhysicalIndexingPlan::with_indexer_ids(&["indexer-1".to_string()]); + plan.add_indexing_task("indexer-1", make_test_task("index-a", "source-1", 1)); + plan.add_indexing_task("indexer-1", make_test_task("index-b", "source-1", 2)); + + let mut scheduler = 
build_test_scheduler_with_plan(plan); + + let request = SwapIndexingPipelinesRequest { + swaps: vec![make_swap_entry( + "indexer-1", + "index-a", + "indexer-1", + "index-b", + )], + }; + let response = scheduler.swap_pipelines(request).unwrap(); + + assert!(!response.results[0].success); + assert!(response.results[0].reason.contains("same node")); + } + + #[tokio::test] + async fn test_swap_pipelines_contradiction_same_slot() { + let mut plan = PhysicalIndexingPlan::with_indexer_ids(&[ + "indexer-1".to_string(), + "indexer-2".to_string(), + "indexer-3".to_string(), + ]); + plan.add_indexing_task("indexer-1", make_test_task("index-a", "source-1", 1)); + plan.add_indexing_task("indexer-2", make_test_task("index-b", "source-1", 2)); + plan.add_indexing_task("indexer-3", make_test_task("index-c", "source-1", 3)); + + let mut scheduler = build_test_scheduler_with_plan(plan); + + // Both swaps try to move index-a from indexer-1. + let request = SwapIndexingPipelinesRequest { + swaps: vec![ + make_swap_entry("indexer-1", "index-a", "indexer-2", "index-b"), + make_swap_entry("indexer-1", "index-a", "indexer-3", "index-c"), + ], + }; + let response = scheduler.swap_pipelines(request).unwrap(); + + // ALL swaps should be rejected. 
+ assert_eq!(response.results.len(), 2); + assert!(!response.results[0].success); + assert!(!response.results[1].success); + assert!(response.results[0].reason.contains("contradictory")); + } + + #[tokio::test] + async fn test_swap_pipelines_contradiction_does_not_apply_any() { + let mut plan = PhysicalIndexingPlan::with_indexer_ids(&[ + "indexer-1".to_string(), + "indexer-2".to_string(), + "indexer-3".to_string(), + ]); + plan.add_indexing_task("indexer-1", make_test_task("index-a", "source-1", 1)); + plan.add_indexing_task("indexer-2", make_test_task("index-b", "source-1", 2)); + plan.add_indexing_task("indexer-3", make_test_task("index-c", "source-1", 3)); + + let mut scheduler = build_test_scheduler_with_plan(plan.clone()); + + let request = SwapIndexingPipelinesRequest { + swaps: vec![ + make_swap_entry("indexer-1", "index-a", "indexer-2", "index-b"), + make_swap_entry("indexer-1", "index-a", "indexer-3", "index-c"), + ], + }; + let _response = scheduler.swap_pipelines(request).unwrap(); + + // Plan should be completely unchanged. 
+ assert_eq!( + scheduler + .state + .current_targeted_physical_plan + .as_ref() + .unwrap(), + &plan, + ); + } + + #[tokio::test] + async fn test_swap_pipelines_fresh_pipeline_uids() { + let mut plan = PhysicalIndexingPlan::with_indexer_ids(&[ + "indexer-1".to_string(), + "indexer-2".to_string(), + ]); + let task_a = make_test_task("index-a", "source-1", 100); + let task_b = make_test_task("index-b", "source-1", 200); + let original_uid_a = task_a.pipeline_uid; + let original_uid_b = task_b.pipeline_uid; + plan.add_indexing_task("indexer-1", task_a); + plan.add_indexing_task("indexer-2", task_b); + + let mut scheduler = build_test_scheduler_with_plan(plan); + + let request = SwapIndexingPipelinesRequest { + swaps: vec![make_swap_entry( + "indexer-1", + "index-a", + "indexer-2", + "index-b", + )], + }; + scheduler.swap_pipelines(request).unwrap(); + + let new_plan = scheduler + .state + .current_targeted_physical_plan + .as_ref() + .unwrap(); + let moved_a = &new_plan.indexer("indexer-2").unwrap()[0]; + let moved_b = &new_plan.indexer("indexer-1").unwrap()[0]; + // Pipeline UIDs must be fresh (different from originals). + assert_ne!(moved_a.pipeline_uid, original_uid_a); + assert_ne!(moved_b.pipeline_uid, original_uid_b); + // And different from each other. + assert_ne!(moved_a.pipeline_uid, moved_b.pipeline_uid); + } + + #[tokio::test] + async fn test_swap_pipelines_multiple_sources_same_index() { + let mut plan = PhysicalIndexingPlan::with_indexer_ids(&[ + "indexer-1".to_string(), + "indexer-2".to_string(), + ]); + // index-a has 2 sources on indexer-1. + plan.add_indexing_task("indexer-1", make_test_task("index-a", "source-kafka", 1)); + plan.add_indexing_task("indexer-1", make_test_task("index-a", "source-ingest", 2)); + // index-b has 2 sources on indexer-2. 
+ plan.add_indexing_task("indexer-2", make_test_task("index-b", "source-kafka", 3)); + plan.add_indexing_task("indexer-2", make_test_task("index-b", "source-ingest", 4)); + + let mut scheduler = build_test_scheduler_with_plan(plan); + + let request = SwapIndexingPipelinesRequest { + swaps: vec![make_swap_entry( + "indexer-1", + "index-a", + "indexer-2", + "index-b", + )], + }; + let response = scheduler.swap_pipelines(request).unwrap(); + + assert!(response.results[0].success); + + let new_plan = scheduler + .state + .current_targeted_physical_plan + .as_ref() + .unwrap(); + // Both sources of index-a should now be on indexer-2. + let indexer_2_tasks = new_plan.indexer("indexer-2").unwrap(); + assert_eq!(indexer_2_tasks.len(), 2); + assert!( + indexer_2_tasks + .iter() + .all(|t| t.index_uid().index_id == "index-a") + ); + // Both sources of index-b should now be on indexer-1. + let indexer_1_tasks = new_plan.indexer("indexer-1").unwrap(); + assert_eq!(indexer_1_tasks.len(), 2); + assert!( + indexer_1_tasks + .iter() + .all(|t| t.index_uid().index_id == "index-b") + ); + } + + #[tokio::test] + async fn test_swap_pipelines_preserves_other_tasks() { + let mut plan = PhysicalIndexingPlan::with_indexer_ids(&[ + "indexer-1".to_string(), + "indexer-2".to_string(), + ]); + // Tasks being swapped. + plan.add_indexing_task("indexer-1", make_test_task("index-a", "source-1", 1)); + plan.add_indexing_task("indexer-2", make_test_task("index-b", "source-1", 2)); + // Other tasks that should not be affected. 
+ plan.add_indexing_task("indexer-1", make_test_task("index-c", "source-1", 3)); + plan.add_indexing_task("indexer-2", make_test_task("index-d", "source-1", 4)); + + let mut scheduler = build_test_scheduler_with_plan(plan); + + let request = SwapIndexingPipelinesRequest { + swaps: vec![make_swap_entry( + "indexer-1", + "index-a", + "indexer-2", + "index-b", + )], + }; + scheduler.swap_pipelines(request).unwrap(); + + let new_plan = scheduler + .state + .current_targeted_physical_plan + .as_ref() + .unwrap(); + let indexer_1_tasks = new_plan.indexer("indexer-1").unwrap(); + let indexer_2_tasks = new_plan.indexer("indexer-2").unwrap(); + + // indexer-1 should have index-c (unchanged) and index-b (swapped in). + assert_eq!(indexer_1_tasks.len(), 2); + let indexer_1_index_ids: Vec<&str> = indexer_1_tasks + .iter() + .map(|t| t.index_uid().index_id.as_str()) + .collect(); + assert!(indexer_1_index_ids.contains(&"index-c")); + assert!(indexer_1_index_ids.contains(&"index-b")); + + // indexer-2 should have index-d (unchanged) and index-a (swapped in). + assert_eq!(indexer_2_tasks.len(), 2); + let indexer_2_index_ids: Vec<&str> = indexer_2_tasks + .iter() + .map(|t| t.index_uid().index_id.as_str()) + .collect(); + assert!(indexer_2_index_ids.contains(&"index-d")); + assert!(indexer_2_index_ids.contains(&"index-a")); + } + + #[tokio::test] + async fn test_swap_pipelines_move_without_swap() { + let mut plan = PhysicalIndexingPlan::with_indexer_ids(&[ + "indexer-1".to_string(), + "indexer-2".to_string(), + ]); + let task_a = make_test_task("index-a", "source-1", 1); + let task_b = make_test_task("index-b", "source-1", 2); + plan.add_indexing_task("indexer-1", task_a.clone()); + plan.add_indexing_task("indexer-2", task_b.clone()); + + let mut scheduler = build_test_scheduler_with_plan(plan); + + // Move index-a from indexer-1 to indexer-2 without swapping anything back. 
+ let request = SwapIndexingPipelinesRequest { + swaps: vec![make_move_entry("indexer-1", "index-a", "indexer-2")], + }; + let response = scheduler.swap_pipelines(request).unwrap(); + + assert_eq!(response.results.len(), 1); + assert!( + response.results[0].success, + "{}", + response.results[0].reason + ); + + let new_plan = scheduler + .state + .current_targeted_physical_plan + .as_ref() + .unwrap(); + + // indexer-1 should have no tasks (index-a was moved away). + let indexer_1_tasks = new_plan.indexer("indexer-1").unwrap(); + assert!(indexer_1_tasks.is_empty()); + + // indexer-2 should have both index-b (unchanged) and index-a (moved in). + let indexer_2_tasks = new_plan.indexer("indexer-2").unwrap(); + assert_eq!(indexer_2_tasks.len(), 2); + let indexer_2_index_ids: Vec<&str> = indexer_2_tasks + .iter() + .map(|t| t.index_uid().index_id.as_str()) + .collect(); + assert!(indexer_2_index_ids.contains(&"index-a")); + assert!(indexer_2_index_ids.contains(&"index-b")); + } + + #[tokio::test] + async fn test_swap_pipelines_move_fresh_pipeline_uids() { + let mut plan = PhysicalIndexingPlan::with_indexer_ids(&[ + "indexer-1".to_string(), + "indexer-2".to_string(), + ]); + let task_a = make_test_task("index-a", "source-1", 100); + let original_uid_a = task_a.pipeline_uid; + plan.add_indexing_task("indexer-1", task_a); + + let mut scheduler = build_test_scheduler_with_plan(plan); + + let request = SwapIndexingPipelinesRequest { + swaps: vec![make_move_entry("indexer-1", "index-a", "indexer-2")], + }; + scheduler.swap_pipelines(request).unwrap(); + + let new_plan = scheduler + .state + .current_targeted_physical_plan + .as_ref() + .unwrap(); + let moved_a = &new_plan.indexer("indexer-2").unwrap()[0]; + // Pipeline UID must be refreshed after the move. 
+ assert_ne!(moved_a.pipeline_uid, original_uid_a); + assert_eq!(moved_a.index_uid().index_id, "index-a"); + } + + #[tokio::test] + async fn test_swap_pipelines_move_unknown_right_indexer() { + let mut plan = PhysicalIndexingPlan::with_indexer_ids(&["indexer-1".to_string()]); + plan.add_indexing_task("indexer-1", make_test_task("index-a", "source-1", 1)); + + let mut scheduler = build_test_scheduler_with_plan(plan); + + let request = SwapIndexingPipelinesRequest { + swaps: vec![make_move_entry("indexer-1", "index-a", "indexer-999")], + }; + let response = scheduler.swap_pipelines(request).unwrap(); + + assert!(!response.results[0].success); + assert!(response.results[0].reason.contains("not found")); + } + + #[tokio::test] + async fn test_swap_pipelines_move_preserves_right_node_tasks() { + let mut plan = PhysicalIndexingPlan::with_indexer_ids(&[ + "indexer-1".to_string(), + "indexer-2".to_string(), + ]); + // indexer-1 has two indexes; only index-a will be moved. + plan.add_indexing_task("indexer-1", make_test_task("index-a", "source-1", 1)); + plan.add_indexing_task("indexer-1", make_test_task("index-c", "source-1", 3)); + // indexer-2 has index-b which should remain untouched. + plan.add_indexing_task("indexer-2", make_test_task("index-b", "source-1", 2)); + + let mut scheduler = build_test_scheduler_with_plan(plan); + + let request = SwapIndexingPipelinesRequest { + swaps: vec![make_move_entry("indexer-1", "index-a", "indexer-2")], + }; + let response = scheduler.swap_pipelines(request).unwrap(); + + assert!( + response.results[0].success, + "{}", + response.results[0].reason + ); + + let new_plan = scheduler + .state + .current_targeted_physical_plan + .as_ref() + .unwrap(); + + // indexer-1 should still have index-c (only index-a was moved). 
+ let indexer_1_tasks = new_plan.indexer("indexer-1").unwrap(); + assert_eq!(indexer_1_tasks.len(), 1); + assert_eq!(indexer_1_tasks[0].index_uid().index_id, "index-c"); + + // indexer-2 should have both index-b (unchanged) and index-a (moved in). + let indexer_2_tasks = new_plan.indexer("indexer-2").unwrap(); + assert_eq!(indexer_2_tasks.len(), 2); + let indexer_2_index_ids: Vec<&str> = indexer_2_tasks + .iter() + .map(|t| t.index_uid().index_id.as_str()) + .collect(); + assert!(indexer_2_index_ids.contains(&"index-a")); + assert!(indexer_2_index_ids.contains(&"index-b")); + } + #[test] fn test_get_sources_to_schedule() { let mut model = ControlPlaneModel::default(); diff --git a/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs b/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs index 63295d61eca..ed256f1fb43 100644 --- a/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs +++ b/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs @@ -1006,6 +1006,10 @@ impl IngestController { /// Moving a shard consists of closing the shard on the source ingester and opening a new /// one on the target ingester. /// + /// When in maintenance mode (`is_maintenance` is true), this function exits early to keep + /// the indexing plan frozen. This design provides a simple safeguard to prevent unintended + /// plan modifications during maintenance. + /// /// This method is guarded by a lock to ensure that only one rebalance operation is performed at /// a time. 
pub(crate) async fn rebalance_shards( @@ -1013,7 +1017,11 @@ impl IngestController { model: &mut ControlPlaneModel, mailbox: &Mailbox, progress: &Progress, + is_maintenance: bool, ) -> MetastoreResult>> { + if is_maintenance { + return Ok(None); + } let Ok(rebalance_guard) = self.rebalance_lock.clone().try_lock_owned() else { debug!("skipping rebalance: another rebalance is already in progress"); return Ok(None); @@ -3262,7 +3270,7 @@ mod tests { let progress = Progress::default(); let close_shards_task_opt = controller - .rebalance_shards(&mut model, &control_plane_mailbox, &progress) + .rebalance_shards(&mut model, &control_plane_mailbox, &progress, false) .await .unwrap(); assert!(close_shards_task_opt.is_none()); @@ -3386,7 +3394,7 @@ mod tests { ingester_pool.insert(ingester_id_1.clone(), ingester_1); let close_shards_task = controller - .rebalance_shards(&mut model, &control_plane_mailbox, &progress) + .rebalance_shards(&mut model, &control_plane_mailbox, &progress, false) .await .unwrap() .unwrap(); diff --git a/quickwit/quickwit-control-plane/src/lib.rs b/quickwit/quickwit-control-plane/src/lib.rs index 01072f7de16..0f1bc8275b8 100644 --- a/quickwit/quickwit-control-plane/src/lib.rs +++ b/quickwit/quickwit-control-plane/src/lib.rs @@ -16,6 +16,7 @@ pub mod control_plane; pub mod indexing_plan; pub mod indexing_scheduler; pub mod ingest; +pub mod maintenance; pub(crate) mod metrics; pub(crate) mod model; diff --git a/quickwit/quickwit-control-plane/src/maintenance.rs b/quickwit/quickwit-control-plane/src/maintenance.rs new file mode 100644 index 00000000000..0149dd453df --- /dev/null +++ b/quickwit/quickwit-control-plane/src/maintenance.rs @@ -0,0 +1,512 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Maintenance mode management for the Quickwit control plane. +//! +//! When maintenance mode is enabled: +//! - Metadata mutations (index/source CRUD) are allowed but the indexing plan is not rebuilt. +//! - The indexing plan is frozen: it is not rebuilt when indexers join or leave. +//! - Shard scaling (up/down) and rebalancing are paused. +//! - The frozen plan and maintenance metadata are persisted to the metastore `kv` table so they +//! survive control plane restarts. +//! +//! # Persistence +//! +//! The state is persisted in the metastore `kv` table under the +//! [`KV_KEY_MAINTENANCE_STATE`] key. The value is a JSON envelope +//! with some basic metadata and the binary encoded plan. + +use base64::Engine as _; +use prost::Message; +use quickwit_proto::control_plane::{MaintenanceFrozenPlan, MaintenanceFrozenPlanForNode}; +use quickwit_proto::metastore::{ + DeleteKvRequest, GetKvRequest, MetastoreService, MetastoreServiceClient, SetKvRequest, +}; +use serde::{Deserialize, Serialize}; +use time::OffsetDateTime; +use time::format_description::well_known::Rfc3339; +use tracing::info; + +use crate::indexing_plan::PhysicalIndexingPlan; + +/// Key in the metastore `kv` table for the combined maintenance state.
+pub const KV_KEY_MAINTENANCE_STATE: &str = "control_plane_maintenance_state"; + +pub const LATEST_MAINTENANCE_FROZEN_PLAN_VERSION: MaintenanceFrozenPlanVersion = + MaintenanceFrozenPlanVersion::V1; + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub enum MaintenanceFrozenPlanVersion { + /// The frozen plan is encoded as protobuf and stored under the + /// "frozen_plan" key as a base64 string. + V1 = 1, +} + +/// Metadata persisted alongside the maintenance mode flag. +/// +/// The `enabled_at` field stores a human-readable RFC 3339 datetime string +/// (e.g., `"2024-06-15T14:30:00Z"`), making it easy to inspect directly in the database. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct MaintenanceModeMetadata { + /// RFC 3339 formatted UTC datetime when maintenance mode was enabled. + enabled_at: String, + /// The version of the maintenance state schema. + version: MaintenanceFrozenPlanVersion, +} + +impl MaintenanceModeMetadata { + /// Creates a new metadata instance with `enabled_at` set to the current UTC time. + pub fn new_now() -> Self { + Self { + enabled_at: now_rfc3339(), + version: LATEST_MAINTENANCE_FROZEN_PLAN_VERSION, + } + } +} + +/// In-memory maintenance mode state for the control plane. +#[derive(Debug, Clone, Default)] +pub struct MaintenanceState { + /// If `Some`, maintenance mode is active with the given metadata. + metadata: Option, +} + +impl MaintenanceState { + /// Returns `true` if maintenance mode is currently active. + pub fn is_active(&self) -> bool { + self.metadata.is_some() + } + + /// Returns the metadata if maintenance mode is active. + pub fn metadata(&self) -> Option<&MaintenanceModeMetadata> { + self.metadata.as_ref() + } + + /// Returns a copy of the RFC 3339 `enabled_at` timestamp if maintenance mode is active. + pub fn enabled_at(&self) -> Option { + self.metadata + .as_ref() + .map(|metadata| metadata.enabled_at.clone()) + } + + /// Enables maintenance mode. + /// Returns the metadata that was set. 
+ pub fn enable(&mut self) -> MaintenanceModeMetadata { + let metadata = MaintenanceModeMetadata { + enabled_at: now_rfc3339(), + version: LATEST_MAINTENANCE_FROZEN_PLAN_VERSION, + }; + self.metadata = Some(metadata.clone()); + info!( + enabled_at = %metadata.enabled_at, + version = ?metadata.version, + "maintenance mode enabled" + ); + metadata + } + + /// Disables maintenance mode. + /// Returns `true` if it was previously active. + pub fn disable(&mut self) -> bool { + let was_active = self.metadata.is_some(); + self.metadata = None; + if was_active { + info!("maintenance mode disabled"); + } + was_active + } + + /// Loads maintenance state from persisted metadata. + pub fn load_from_metadata(&mut self, metadata: MaintenanceModeMetadata) { + info!( + enabled_at = %metadata.enabled_at, + "loaded maintenance mode from persisted state" + ); + self.metadata = Some(metadata); + } +} + +// -- Persistence -- + +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct MaintenancePersistedState { + pub metadata: MaintenanceModeMetadata, + pub frozen_plan: PhysicalIndexingPlan, +} + +impl MaintenancePersistedState { + pub fn serialize(&self) -> anyhow::Result { + match self.metadata.version { + MaintenanceFrozenPlanVersion::V1 => self.serialize_v1(), + } + } + + pub fn deserialize(encoded: &str) -> anyhow::Result { + let envelope: serde_json::Value = serde_json::from_str(encoded)?; + let metadata: MaintenanceModeMetadata = + serde_json::from_value(envelope["metadata"].clone())?; + let frozen_plan = match metadata.version { + MaintenanceFrozenPlanVersion::V1 => { + Self::deserialize_v1_frozen_plan(envelope["frozen_plan"].as_str().ok_or_else( + || anyhow::anyhow!("missing frozen_plan field in maintenance state"), + )?)? 
+ } + }; + Ok(Self { + metadata, + frozen_plan, + }) + } + + fn deserialize_v1_frozen_plan(encoded: &str) -> anyhow::Result { + let decoded = base64::engine::general_purpose::STANDARD + .decode(encoded) + .map_err(|err| anyhow::anyhow!("failed to base64 decode frozen plan: {err}"))?; + let proto_state = MaintenanceFrozenPlan::decode(&decoded[..]) + .map_err(|err| anyhow::anyhow!("failed to decode protobuf frozen plan: {err}"))?; + + // Collect all indexer node IDs (carried in the proto `index_id` field) to initialize the plan + let indexer_ids: Vec = proto_state + .state_per_node + .iter() + .map(|node_state| node_state.index_id.clone()) + .collect(); + + let mut plan = PhysicalIndexingPlan::with_indexer_ids(&indexer_ids); + + for node_state in proto_state.state_per_node { + for task in node_state.indexing_tasks { + plan.add_indexing_task(&node_state.index_id, task); + } + } + Ok(plan) + } + + fn serialize_v1(&self) -> anyhow::Result { + let proto_state = self.frozen_plan_to_proto(); + + // Encode the protobuf message to binary + let mut buf = Vec::new(); + prost::Message::encode(&proto_state, &mut buf) + .map_err(|err| anyhow::anyhow!("failed to encode protobuf: {err}"))?; + + // Base64 encode the binary data + let base64_encoded = base64::engine::general_purpose::STANDARD.encode(&buf); + + let json_value = serde_json::json!({ + "frozen_plan": base64_encoded, + "metadata": serde_json::to_value(&self.metadata)?, + }); + Ok(serde_json::to_string(&json_value)?) + } + + /// Converts the frozen plan to the protobuf representation. 
+ fn frozen_plan_to_proto(&self) -> MaintenanceFrozenPlan { + let state_per_node: Vec = self + .frozen_plan + .indexing_tasks_per_indexer() + .iter() + .map(|(node_id, tasks)| MaintenanceFrozenPlanForNode { + index_id: node_id.clone(), + indexing_tasks: tasks.clone(), + }) + .collect(); + + MaintenanceFrozenPlan { state_per_node } + } +} + +/// Persists maintenance state using the metastore's `GetKv`/`SetKv`/`DeleteKv` +/// RPCs to the PostgreSQL `kv` table. 
+#[derive(Debug, Clone)] +pub struct MetastoreKvPersistence { + metastore: MetastoreServiceClient, +} + +impl MetastoreKvPersistence { + pub fn new(metastore: MetastoreServiceClient) -> Self { + Self { metastore } + } + + /// Loads the maintenance state from persistent storage. Returns `None` if + /// no maintenance state is persisted. + /// + /// Panics if the state can't be fetched or deserialized. + pub async fn load(&self) -> Option { + let response = self + .metastore + .clone() + .get_kv(GetKvRequest { + key: KV_KEY_MAINTENANCE_STATE.to_string(), + }) + .await + .expect("failed to get maintenance state from metastore"); + let encoded = response.value?; // return None if no value is set + let persisted = MaintenancePersistedState::deserialize(&encoded) + .expect("failed to deserialize maintenance state from metastore"); + Some(persisted) + } + + /// Persists the maintenance metadata and frozen plan atomically. + pub async fn save( + &self, + metadata: &MaintenanceModeMetadata, + frozen_plan: &PhysicalIndexingPlan, + ) -> anyhow::Result<()> { + let persisted = MaintenancePersistedState { + metadata: metadata.clone(), + frozen_plan: frozen_plan.clone(), + }; + let serialized = persisted.serialize()?; + self.metastore + .clone() + .set_kv(SetKvRequest { + key: KV_KEY_MAINTENANCE_STATE.to_string(), + value: serialized, + }) + .await?; + Ok(()) + } + + /// Clears all persisted maintenance state. + pub async fn clear(&self) -> anyhow::Result<()> { + self.metastore + .clone() + .delete_kv(DeleteKvRequest { + key: KV_KEY_MAINTENANCE_STATE.to_string(), + }) + .await?; + Ok(()) + } +} + +// -- Helper functions -- + +/// Serializes a `PhysicalIndexingPlan` to a JSON string for use in API responses. +pub fn serialize_frozen_plan(plan: &PhysicalIndexingPlan) -> serde_json::Result { + serde_json::to_string(plan) +} + +/// Returns the current UTC time formatted as an RFC 3339 string. 
+fn now_rfc3339() -> String { + OffsetDateTime::now_utc() + .format(&Rfc3339) + .expect("formatting OffsetDateTime as RFC 3339 should never fail") +} + +#[cfg(test)] +mod tests { + use quickwit_proto::metastore::{ + EmptyResponse, GetKvResponse, MetastoreServiceClient, MockMetastoreService, + }; + + use super::*; + + #[test] + fn test_maintenance_state_default_is_inactive() { + let state = MaintenanceState::default(); + assert!(!state.is_active()); + assert!(state.metadata().is_none()); + } + + #[test] + fn test_maintenance_state_enable_disable() { + let mut state = MaintenanceState::default(); + + // Enable + let metadata = state.enable(); + assert!(state.is_active()); + assert!(!metadata.enabled_at.is_empty()); + // Should be a valid RFC 3339 datetime + assert!( + OffsetDateTime::parse(&metadata.enabled_at, &Rfc3339).is_ok(), + "enabled_at should be valid RFC 3339: {}", + metadata.enabled_at + ); + + // Disable + let was_active = state.disable(); + assert!(was_active); + assert!(!state.is_active()); + + // Disable again is a no-op + let was_active = state.disable(); + assert!(!was_active); + } + + #[test] + fn test_current_persisted_state_version_roundtrip() { + let metadata = MaintenanceModeMetadata { + enabled_at: "2024-06-15T14:30:00Z".to_string(), + version: LATEST_MAINTENANCE_FROZEN_PLAN_VERSION, + }; + let plan = PhysicalIndexingPlan::with_indexer_ids(&[ + "indexer-1".to_string(), + "indexer-2".to_string(), + ]); + let state = MaintenancePersistedState { + metadata: metadata.clone(), + frozen_plan: plan.clone(), + }; + let serialized = state + .serialize() + .expect("failed to serialize maintenance state"); + let deserialized: MaintenancePersistedState = + MaintenancePersistedState::deserialize(&serialized).unwrap(); + assert_eq!(deserialized, state); + } + + /// Validates that an existing V1 serialization can still be deserialized. 
+ #[test] + fn test_postcard_v1_deserialization_stability() { + let metadata = MaintenanceModeMetadata { + enabled_at: "2024-06-15T14:30:00Z".to_string(), + version: MaintenanceFrozenPlanVersion::V1, + }; + let plan = PhysicalIndexingPlan::with_indexer_ids(&["indexer-1".to_string()]); + let expected_state = MaintenancePersistedState { + metadata: metadata.clone(), + frozen_plan: plan.clone(), + }; + // // this was used to generate the `encoded` string + // println!( + // "{}", + // expected_state + // .serialize() + // .expect("failed to serialize expected state") + // ); + let encoded = r#"{"frozen_plan":"EgsKCWluZGV4ZXItMQ==","metadata":{"enabled_at":"2024-06-15T14:30:00Z","version":"V1"}}"#; + let deserialized = MaintenancePersistedState::deserialize(encoded).unwrap(); + assert_eq!(deserialized, expected_state); + } + + #[tokio::test] + async fn test_metastore_persistence_save_and_load() { + let mut mock_metastore = MockMetastoreService::new(); + + // Initially empty + mock_metastore + .expect_get_kv() + .times(1) + .returning(|_| Ok(GetKvResponse { value: None })); + + // Save + mock_metastore + .expect_set_kv() + .times(1) + .returning(|_| Ok(EmptyResponse {})); + + // Load + let metadata = MaintenanceModeMetadata { + enabled_at: "2024-01-15T10:00:00Z".to_string(), + version: MaintenanceFrozenPlanVersion::V1, + }; + let plan = PhysicalIndexingPlan::with_indexer_ids(&["indexer-1".to_string()]); + let expected_state = MaintenancePersistedState { + metadata: metadata.clone(), + frozen_plan: plan.clone(), + }; + let expected_encoded = expected_state.serialize().unwrap(); + + mock_metastore.expect_get_kv().times(1).returning(move |_| { + Ok(GetKvResponse { + value: Some(expected_encoded.clone()), + }) + }); + + // Clear + mock_metastore + .expect_delete_kv() + .times(1) + .returning(|_| Ok(EmptyResponse {})); + + // One final load to verify cleared + mock_metastore + .expect_get_kv() + .times(1) + .returning(|_| Ok(GetKvResponse { value: None })); + + let 
metastore_client = MetastoreServiceClient::from_mock(mock_metastore); + let persistence = MetastoreKvPersistence::new(metastore_client); + + // Initially empty + let loaded = persistence.load().await; + assert!(loaded.is_none()); + + // Save + persistence.save(&metadata, &plan).await.unwrap(); + + // Load + let loaded = persistence.load().await.unwrap(); + assert_eq!(loaded.metadata, metadata); + assert_eq!(loaded.frozen_plan, plan); + + // Clear + persistence.clear().await.unwrap(); + let loaded = persistence.load().await; + assert!(loaded.is_none()); + } + + #[tokio::test] + async fn test_metastore_persistence_overwrite() { + let mut mock_metastore = MockMetastoreService::new(); + + let metadata1 = MaintenanceModeMetadata { + enabled_at: "2024-01-01T00:00:00Z".to_string(), + version: MaintenanceFrozenPlanVersion::V1, + }; + let plan1 = PhysicalIndexingPlan::with_indexer_ids(&["a".to_string()]); + + let metadata2 = MaintenanceModeMetadata { + enabled_at: "2024-06-01T12:00:00Z".to_string(), + version: MaintenanceFrozenPlanVersion::V1, + }; + let plan2 = PhysicalIndexingPlan::with_indexer_ids(&["b".to_string()]); + + // First save + mock_metastore + .expect_set_kv() + .times(1) + .returning(|_| Ok(EmptyResponse {})); + + // Second save (overwrite) + mock_metastore + .expect_set_kv() + .times(1) + .returning(|_| Ok(EmptyResponse {})); + + // Load - return the second state + let expected_state2 = MaintenancePersistedState { + metadata: metadata2.clone(), + frozen_plan: plan2.clone(), + }; + let expected_encoded2 = expected_state2.serialize().unwrap(); + + mock_metastore.expect_get_kv().times(1).returning(move |_| { + Ok(GetKvResponse { + value: Some(expected_encoded2.clone()), + }) + }); + + let metastore_client = MetastoreServiceClient::from_mock(mock_metastore); + let persistence = MetastoreKvPersistence::new(metastore_client); + + persistence.save(&metadata1, &plan1).await.unwrap(); + persistence.save(&metadata2, &plan2).await.unwrap(); + + let loaded = 
persistence.load().await.unwrap(); + assert_eq!(loaded.metadata, metadata2); + assert_eq!(loaded.frozen_plan, plan2); + } +} diff --git a/quickwit/quickwit-control-plane/src/metrics.rs b/quickwit/quickwit-control-plane/src/metrics.rs index 5e534c4f176..c3370d9b3a7 100644 --- a/quickwit/quickwit-control-plane/src/metrics.rs +++ b/quickwit/quickwit-control-plane/src/metrics.rs @@ -42,6 +42,9 @@ pub struct ControlPlaneMetrics { // Indexing plan metrics. pub local_shards: IntGauge, pub remote_shards: IntGauge, + + // Maintenance mode. + pub maintenance_mode: IntGauge, } impl ControlPlaneMetrics { @@ -128,6 +131,12 @@ impl Default for ControlPlaneMetrics { ), local_shards, remote_shards, + maintenance_mode: new_gauge( + "maintenance_mode", + "Whether the control plane is in maintenance mode (1 = enabled, 0 = disabled).", + "control_plane", + &[], + ), } } } diff --git a/quickwit/quickwit-control-plane/src/tests.rs b/quickwit/quickwit-control-plane/src/tests.rs index 9f0cd97b477..617ee9abcd1 100644 --- a/quickwit/quickwit-control-plane/src/tests.rs +++ b/quickwit/quickwit-control-plane/src/tests.rs @@ -29,7 +29,8 @@ use quickwit_indexing::IndexingService; use quickwit_metastore::{IndexMetadata, ListIndexesMetadataResponseExt}; use quickwit_proto::indexing::{ApplyIndexingPlanRequest, CpuCapacity, IndexingServiceClient}; use quickwit_proto::metastore::{ - ListIndexesMetadataResponse, ListShardsResponse, MetastoreServiceClient, MockMetastoreService, + GetKvResponse, ListIndexesMetadataResponse, ListShardsResponse, MetastoreServiceClient, + MockMetastoreService, }; use quickwit_proto::types::NodeId; use serde_json::json; @@ -121,6 +122,9 @@ async fn start_control_plane( subresponses: Vec::new(), }) }); + mock_metastore + .expect_get_kv() + .returning(|_| Ok(GetKvResponse { value: None })); let mut indexer_inboxes = Vec::new(); let indexer_pool = Pool::default(); @@ -178,7 +182,7 @@ async fn test_scheduler_scheduling_and_control_loop_apply_plan_again() { 
indexing_service_inbox.drain_for_test_typed::(); assert_eq!(scheduler_state.num_applied_physical_indexing_plan, 1); assert_eq!(scheduler_state.num_schedule_indexing_plan, 1); - assert!(scheduler_state.last_applied_physical_plan.is_some()); + assert!(scheduler_state.current_targeted_physical_plan.is_some()); assert_eq!(indexing_service_inbox_messages.len(), 1); // After a CONTROL_PLAN_LOOP_INTERVAL, the control loop will check if the desired plan is @@ -266,7 +270,7 @@ async fn test_scheduler_scheduling_no_indexer() { .indexing_scheduler; assert_eq!(scheduler_state.num_applied_physical_indexing_plan, 0); assert_eq!(scheduler_state.num_schedule_indexing_plan, 0); - assert!(scheduler_state.last_applied_physical_plan.is_none()); + assert!(scheduler_state.current_targeted_physical_plan.is_none()); // There is no indexer, we should observe no // scheduling. @@ -278,7 +282,7 @@ async fn test_scheduler_scheduling_no_indexer() { .indexing_scheduler; assert_eq!(scheduler_state.num_applied_physical_indexing_plan, 0); assert_eq!(scheduler_state.num_schedule_indexing_plan, 0); - assert!(scheduler_state.last_applied_physical_plan.is_none()); + assert!(scheduler_state.current_targeted_physical_plan.is_none()); universe.assert_quit().await; } @@ -324,7 +328,7 @@ async fn test_scheduler_scheduling_multiple_indexers() { indexing_service_inbox_1.drain_for_test_typed::(); assert_eq!(scheduler_state.num_applied_physical_indexing_plan, 0); assert_eq!(scheduler_state.num_schedule_indexing_plan, 0); - assert!(scheduler_state.last_applied_physical_plan.is_none()); + assert!(scheduler_state.current_targeted_physical_plan.is_none()); assert_eq!(indexing_service_inbox_messages.len(), 0); cluster diff --git a/quickwit/quickwit-datetime/src/java_date_time_format.rs b/quickwit/quickwit-datetime/src/java_date_time_format.rs index 2ef63f32881..a0d6c1cb0f5 100644 --- a/quickwit/quickwit-datetime/src/java_date_time_format.rs +++ b/quickwit/quickwit-datetime/src/java_date_time_format.rs @@ -261,14 
+261,17 @@ fn resolve_java_datetime_format_alias(java_datetime_format: &str) -> &str { OnceLock::new(); let java_datetime_format_map = JAVA_DATE_FORMAT_ALIASES.get_or_init(|| { let mut m = HashMap::new(); - m.insert("date_optional_time", "yyyy-MM-dd['T'HH:mm:ss.SSSZ]"); + m.insert( + "date_optional_time", + "yyyy[-MM[-dd['T'HH[:mm[:ss[.SSS][Z]]]]]]", + ); m.insert( "strict_date_optional_time", - "yyyy[-MM[-dd['T'HH[:mm[:ss[.SSS[Z]]]]]]]", + "yyyy[-MM[-dd['T'HH[:mm[:ss[.SSS][Z]]]]]]", ); m.insert( "strict_date_optional_time_nanos", - "yyyy[-MM[-dd['T'HH:mm:ss.SSSSSSZ]]]", + "yyyy[-MM[-dd['T'HH[:mm[:ss[.SSSSSS][Z]]]]]]", ); m.insert("basic_date", "yyyyMMdd"); @@ -660,6 +663,7 @@ mod tests { "2019-03-23T21:35:46.123+00:00", "2019-03-23T21:36:46.123+03:00", "2019-03-23T21:37:46.123+0300", + "2019-03-23T21:38:46+00:00", ]; let expected = [ datetime!(2019-01-01 00:00:00 UTC), @@ -671,6 +675,7 @@ mod tests { datetime!(2019-03-23 21:35:46.123 UTC), datetime!(2019-03-23 21:36:46.123 +03:00:00), datetime!(2019-03-23 21:37:46.123 +03:00:00), + datetime!(2019-03-23 21:38:46 UTC), ]; for (date_str, &expected_dt) in dates.iter().zip(expected.iter()) { let parsed_dt = parser @@ -692,6 +697,7 @@ mod tests { "2019-03-23T21:35:46.123456789+00:00", "2019-03-23T21:36:46.123456789+03:00", "2019-03-23T21:37:46.123456789+0300", + "2019-03-23T21:38:46+00:00", ]; let expected = [ datetime!(2019-01-01 00:00:00 UTC), @@ -701,6 +707,7 @@ mod tests { datetime!(2019-03-23 21:35:46.123456789 UTC), datetime!(2019-03-23 21:36:46.123456789 +03:00:00), datetime!(2019-03-23 21:37:46.123456789 +03:00:00), + datetime!(2019-03-23 21:38:46 UTC), ]; for (date_str, &expected_dt) in dates.iter().zip(expected.iter()) { let parsed_dt = parser diff --git a/quickwit/quickwit-doc-mapper/Cargo.toml b/quickwit/quickwit-doc-mapper/Cargo.toml index ae0239e53c5..c3eadcd4bd1 100644 --- a/quickwit/quickwit-doc-mapper/Cargo.toml +++ b/quickwit/quickwit-doc-mapper/Cargo.toml @@ -25,6 +25,7 @@ serde_json = { workspace = 
true } serde_json_borrow = { workspace = true } siphasher = { workspace = true } tantivy = { workspace = true } +tantivy-fst = { workspace = true } thiserror = { workspace = true } tracing = { workspace = true } utoipa = { workspace = true } @@ -42,10 +43,9 @@ serde_yaml = { workspace = true } time = { workspace = true } quickwit-common = { workspace = true, features = ["testsuite"] } -quickwit-query = { workspace = true, features = ["multilang"] } +quickwit-query = { workspace = true } [features] -multilang = ["quickwit-query/multilang"] testsuite = [] [[bench]] diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs index 1eb2cea02d9..1b5dc19b12e 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs @@ -78,6 +78,8 @@ pub struct DocMapper { timestamp_field_path: Option>, /// Secondary timestamp field name. secondary_timestamp_field_name: Option, + /// Indexation time field name. + indexation_time_field_name: Option, /// Root node of the field mapping tree. /// See [`MappingNode`]. 
field_mappings: MappingNode, @@ -128,6 +130,31 @@ fn validate_timestamp_field( Ok(()) } +fn validate_indexation_time_field( + indexation_field_path: &str, + mapping_root_node: &MappingNode, +) -> anyhow::Result<()> { + if indexation_field_path.starts_with('.') || indexation_field_path.starts_with("\\.") { + bail!("indexation_time field `{indexation_field_path}` should not start with a `.`"); + } + if indexation_field_path.ends_with('.') { + bail!("indexation_time field `{indexation_field_path}` should not end with a `.`"); + } + let Some(indexation_time_field_type) = + mapping_root_node.find_field_mapping_type(indexation_field_path) + else { + bail!("could not find indexation_time field `{indexation_field_path}` in field mappings"); + }; + if let FieldMappingType::DateTime(_, cardinality) = &indexation_time_field_type { + if cardinality != &Cardinality::SingleValued { + bail!("indexation_time field `{indexation_field_path}` should be single-valued"); + } + } else { + bail!("indexation_time field `{indexation_field_path}` should be a datetime field"); + } + Ok(()) +} + impl From for DocMapperBuilder { fn from(default_doc_mapper: DocMapper) -> Self { let partition_key_str = default_doc_mapper.partition_key.to_string(); @@ -142,6 +169,7 @@ impl From for DocMapperBuilder { field_mappings: default_doc_mapper.field_mappings.into(), timestamp_field: default_doc_mapper.timestamp_field_name, secondary_timestamp_field: default_doc_mapper.secondary_timestamp_field_name, + indexation_time_field: default_doc_mapper.indexation_time_field_name, tag_fields: default_doc_mapper.tag_field_names, partition_key: partition_key_opt, max_num_partitions: default_doc_mapper.max_num_partitions, @@ -203,6 +231,9 @@ impl TryFrom for DocMapper { } else { None }; + if let Some(indexation_time_field_name) = &doc_mapping.indexation_time_field { + validate_indexation_time_field(indexation_time_field_name, &field_mappings)?; + } let schema = schema_builder.build(); let tokenizer_manager = 
create_default_quickwit_tokenizer_manager(); @@ -293,6 +324,7 @@ impl TryFrom for DocMapper { timestamp_field_name: doc_mapping.timestamp_field, timestamp_field_path, secondary_timestamp_field_name: doc_mapping.secondary_timestamp_field, + indexation_time_field_name: doc_mapping.indexation_time_field, field_mappings, concatenate_dynamic_fields, tag_field_names, @@ -681,6 +713,11 @@ impl DocMapper { self.secondary_timestamp_field_name.as_deref() } + /// Returns the indexation time field name. + pub fn indexation_time_field_name(&self) -> Option<&str> { + self.indexation_time_field_name.as_deref() + } + /// Returns the tag `NameField`s on the current schema. /// Returns an error if a tag field is not found in this schema. pub fn tag_named_fields(&self) -> anyhow::Result> { diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/field_mapping_entry.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/field_mapping_entry.rs index ae3388aee32..e69d337a616 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/field_mapping_entry.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/field_mapping_entry.rs @@ -1152,7 +1152,7 @@ mod tests { "type": "text", "stored": true, "record": "basic", - "tokenizer": "en_stem" + "tokenizer": "lowercase" } "#, )?; @@ -1161,7 +1161,7 @@ mod tests { FieldMappingType::Text(options, _) => { assert_eq!(options.stored, true); let indexing_options = options.indexing_options.unwrap(); - assert_eq!(indexing_options.tokenizer.name(), "en_stem"); + assert_eq!(indexing_options.tokenizer.name(), "lowercase"); assert_eq!(indexing_options.record, IndexRecordOption::Basic); } _ => panic!("wrong property type"), diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs index bed4b18b90f..370674c9536 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs @@ -26,10 +26,9 @@ use std::collections::{HashMap, HashSet}; use 
std::fmt::Debug; use std::ops::Bound; +use anyhow::bail; pub use doc_mapper_builder::DocMapperBuilder; pub use doc_mapper_impl::DocMapper; -#[cfg(all(test, feature = "multilang"))] -pub(crate) use field_mapping_entry::TextIndexingOptions; pub use field_mapping_entry::{ BinaryFormat, FastFieldOptions, FieldMappingEntry, QuickwitBytesOptions, QuickwitJsonOptions, QuickwitTextNormalizer, @@ -43,6 +42,7 @@ pub use field_mapping_type::FieldMappingType; use serde_json::Value as JsonValue; use tantivy::Term; use tantivy::schema::{Field, FieldType}; +use tantivy_fst::Automaton as TantivyFstAutomaton; pub(crate) use tokenizer_entry::{ NgramTokenizerOption, RegexTokenizerOption, TokenFilterType, TokenizerType, }; @@ -78,10 +78,70 @@ pub struct TermRange { #[derive(Debug, Clone, PartialEq, Eq, Hash)] /// Supported automaton types to warmup pub enum Automaton { - /// A regex in it's str representation as tantivy_fst::Regex isn't PartialEq, and the path if + /// A regex in its str representation as tantivy_fst::Regex isn't PartialEq, and the path if /// inside a json field Regex(Option>, String), - // we could add termset query here, instead of downloading the whole dictionary + /// An exact-match automaton for a TermSet query. + TermSet(ExactSetAutomaton), +} + +/// A byte-level DFA that accepts exactly the strings in a sorted, deduplicated byte-sequence +/// set. State = `(depth, lo, hi)` meaning all terms in `self.terms[lo..hi]` share the first +/// `depth` bytes consumed so far. Transitions are computed via binary search, avoiding any +/// upfront DFA materialisation. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct ExactSetAutomaton { + /// Holds sorted, deduplicated `term.serialized_value_bytes()` for all terms in the set. + /// Using `warm_postings_automaton` coalesces both the SSTable lookup and the postings + /// downloads into a small number of merged range requests. 
+ terms: Vec>, +} + +impl ExactSetAutomaton { + /// Create an `ExactSetAutomaton` from an iterator of terms. + pub fn try_from_terms<'a>(terms: impl IntoIterator) -> anyhow::Result { + let mut sorted_bytes: Vec> = terms + .into_iter() + .map(|term| term.serialized_value_bytes().to_vec()) + .collect(); + if sorted_bytes.is_empty() { + bail!("Cannot create an ExactSetAutomaton from an empty set of terms"); + } + sorted_bytes.sort(); + sorted_bytes.dedup(); + Ok(ExactSetAutomaton { + terms: sorted_bytes, + }) + } +} + +impl TantivyFstAutomaton for ExactSetAutomaton { + /// (depth, lo, hi) + type State = (usize, usize, usize); + + fn start(&self) -> Self::State { + (0, 0, self.terms.len()) + } + + fn is_match(&self, &(depth, lo, hi): &Self::State) -> bool { + lo < hi && self.terms[lo].len() == depth + } + + fn can_match(&self, &(_, lo, hi): &Self::State) -> bool { + lo < hi + } + + fn accept(&self, &(depth, lo, hi): &Self::State, byte: u8) -> Self::State { + // Within [lo, hi), terms are sorted by their bytes. Terms of length == depth (exact + // matches) sort before any extension, so there is at most one such term at index lo. + // Skip it — it has no byte at position `depth`. + let lo = lo + usize::from(lo < hi && self.terms[lo].len() == depth); + // Binary-search for the sub-range where terms[i][depth] == byte. + // All remaining terms in [lo, hi) have length > depth, so indexing [depth] is safe. + let new_lo = lo + self.terms[lo..hi].partition_point(|t| t[depth] < byte); + let new_hi = new_lo + self.terms[new_lo..hi].partition_point(|t| t[depth] <= byte); + (depth + 1, new_lo, new_hi) + } } /// Description of how a fast field should be warmed up @@ -97,9 +157,6 @@ pub struct FastFieldWarmupInfo { /// running the query. 
#[derive(Debug, Default, Clone, PartialEq, Eq)] pub struct WarmupInfo { - /// Name of fields from the term dictionary and posting list which needs to - /// be entirely loaded - pub term_dict_fields: HashSet, /// Fast fields which needs to be loaded pub fast_fields: HashSet, /// Whether to warmup field norms. Used mostly for scoring. @@ -115,7 +172,6 @@ pub struct WarmupInfo { impl WarmupInfo { /// Merge other WarmupInfo into self. pub fn merge(&mut self, other: WarmupInfo) { - self.term_dict_fields.extend(other.term_dict_fields); self.field_norms |= other.field_norms; for fast_field_warmup_info in other.fast_fields.into_iter() { @@ -153,21 +209,6 @@ impl WarmupInfo { /// Simplify a WarmupInfo, removing some redundant tasks pub fn simplify(&mut self) { - self.terms_grouped_by_field.retain(|field, terms| { - if self.term_dict_fields.contains(field) { - // we are already about to full-load this dictionary. We only care about terms - // which needs additional position - terms.retain(|_term, include_position| *include_position); - } - // if no term is left, remove the entry from the hashmap - !terms.is_empty() - }); - self.term_ranges_grouped_by_field.retain(|field, terms| { - if self.term_dict_fields.contains(field) { - terms.retain(|_term, include_position| *include_position); - } - !terms.is_empty() - }); // TODO we could remove from terms_grouped_by_field for ranges with no `limit` in // term_ranges_grouped_by_field } @@ -624,13 +665,6 @@ mod tests { .collect() } - fn hashset_field(elements: &[u32]) -> HashSet { - elements - .iter() - .map(|elem| Field::from_field_id(*elem)) - .collect() - } - fn hashmap(elements: &[(u32, &str, bool)]) -> HashMap> { let mut result: HashMap> = HashMap::new(); for (field, term, pos) in elements { @@ -665,7 +699,6 @@ mod tests { #[test] fn test_warmup_info_merge() { let wi_base = WarmupInfo { - term_dict_fields: hashset_field(&[1, 2]), fast_fields: hashset_fast(&["fast1", "fast2"]), field_norms: false, terms_grouped_by_field: 
hashmap(&[(1, "term1", false), (1, "term2", false)]), @@ -688,7 +721,6 @@ mod tests { let mut wi_base = wi_base; let wi_2 = WarmupInfo { - term_dict_fields: hashset_field(&[2, 3]), fast_fields: hashset_fast(&["fast2", "fast3"]), field_norms: true, terms_grouped_by_field: hashmap(&[(2, "term1", false), (1, "term2", true)]), @@ -705,7 +737,6 @@ mod tests { }; wi_base.merge(wi_2.clone()); - assert_eq!(wi_base.term_dict_fields, hashset_field(&[1, 2, 3])); assert_eq!( wi_base.fast_fields, hashset_fast(&["fast1", "fast2", "fast3"]) @@ -771,7 +802,6 @@ mod tests { #[test] fn test_warmup_info_simplify() { let mut warmup_info = WarmupInfo { - term_dict_fields: hashset_field(&[1]), fast_fields: hashset_fast(&["fast1", "fast2"]), field_norms: false, terms_grouped_by_field: hashmap(&[ @@ -793,11 +823,15 @@ mod tests { .collect(), }; let expected = WarmupInfo { - term_dict_fields: hashset_field(&[1]), fast_fields: hashset_fast(&["fast1", "fast2"]), field_norms: false, - terms_grouped_by_field: hashmap(&[(1, "term2", true), (2, "term3", false)]), + terms_grouped_by_field: hashmap(&[ + (1, "term1", false), + (1, "term2", true), + (2, "term3", false), + ]), term_ranges_grouped_by_field: hashmap_ranges(&[ + (1, "term1", false), (1, "term2", true), (2, "term3", false), ]), @@ -812,55 +846,4 @@ mod tests { warmup_info.simplify(); assert_eq!(warmup_info, expected); } - - #[test] - #[cfg(feature = "multilang")] - fn test_doc_mapper_query_with_multilang_field() { - use quickwit_query::query_ast::TermQuery; - use tantivy::schema::IndexRecordOption; - - use crate::doc_mapper::{ - QuickwitTextOptions, QuickwitTextTokenizer, TextIndexingOptions, TokenizerType, - }; - use crate::{TokenizerConfig, TokenizerEntry}; - let mut doc_mapper_builder = DocMapperBuilder::default(); - doc_mapper_builder - .doc_mapping - .field_mappings - .push(FieldMappingEntry { - name: "multilang".to_string(), - mapping_type: FieldMappingType::Text( - QuickwitTextOptions { - indexing_options: Some(TextIndexingOptions 
{ - tokenizer: QuickwitTextTokenizer::from_static("multilang"), - record: IndexRecordOption::Basic, - fieldnorms: false, - }), - ..Default::default() - }, - Cardinality::SingleValued, - ), - }); - doc_mapper_builder - .doc_mapping - .tokenizers - .push(TokenizerEntry { - name: "multilang".to_string(), - config: TokenizerConfig { - tokenizer_type: TokenizerType::Multilang, - filters: Vec::new(), - }, - }); - let doc_mapper = doc_mapper_builder.try_build().unwrap(); - let schema = doc_mapper.schema(); - let query_ast = quickwit_query::query_ast::QueryAst::Term(TermQuery { - field: "multilang".to_string(), - value: "JPN:す".to_string(), - }); - let (query, _) = doc_mapper.query(schema, query_ast, false, None).unwrap(); - assert_eq!( - format!("{query:?}"), - r#"TermQuery(Term(field=2, type=Str, "JPN:す"))"# - ); - } } diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/tokenizer_entry.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/tokenizer_entry.rs index b9793dc9548..0488d118c9f 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/tokenizer_entry.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/tokenizer_entry.rs @@ -44,10 +44,6 @@ impl TokenizerConfig { pub fn text_analyzer(&self) -> anyhow::Result { let mut text_analyzer_builder = match &self.tokenizer_type { TokenizerType::Simple => TextAnalyzer::builder(SimpleTokenizer::default()).dynamic(), - #[cfg(any(test, feature = "multilang"))] - TokenizerType::Multilang => { - TextAnalyzer::builder(quickwit_query::MultiLangTokenizer::default()).dynamic() - } TokenizerType::SourceCode => TextAnalyzer::builder(CodeTokenizer::default()).dynamic(), TokenizerType::Ngram(options) => { let tokenizer = @@ -120,8 +116,6 @@ impl TokenFilterType { #[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize, utoipa::ToSchema)] #[serde(tag = "type", rename_all = "snake_case")] pub enum TokenizerType { - #[cfg(any(test, feature = "multilang"))] - Multilang, Ngram(NgramTokenizerOption), Regex(RegexTokenizerOption), 
Simple, diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapping.rs b/quickwit/quickwit-doc-mapper/src/doc_mapping.rs index d8afa4b16e9..8fc1ce8096a 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapping.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapping.rs @@ -133,6 +133,13 @@ pub struct DocMapping { #[serde(skip_serializing_if = "Option::is_none")] pub secondary_timestamp_field: Option, + /// Declares the field which will contain the indexation time for the document. + /// This field is automatically populated by the indexer + /// with the time at which the document is indexed. + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub indexation_time_field: Option, + /// Declares the low cardinality fields for which the values ​​are recorded directly in the /// splits metadata. #[schema(value_type = Vec)] @@ -207,6 +214,7 @@ mod tests { ], timestamp_field: Some("timestamp".to_string()), secondary_timestamp_field: None, + indexation_time_field: None, tag_fields: BTreeSet::from_iter(["level".to_string()]), partition_key: Some("tenant_id".to_string()), max_num_partitions: NonZeroU32::new(100).unwrap(), diff --git a/quickwit/quickwit-doc-mapper/src/lib.rs b/quickwit/quickwit-doc-mapper/src/lib.rs index 8dee8d700ed..1b5a67908e3 100644 --- a/quickwit/quickwit-doc-mapper/src/lib.rs +++ b/quickwit/quickwit-doc-mapper/src/lib.rs @@ -30,9 +30,9 @@ mod routing_expression; pub mod tag_pruning; pub use doc_mapper::{ - Automaton, BinaryFormat, DocMapper, DocMapperBuilder, FastFieldWarmupInfo, FieldMappingEntry, - FieldMappingType, JsonObject, NamedField, QuickwitBytesOptions, QuickwitJsonOptions, TermRange, - TokenizerConfig, TokenizerEntry, WarmupInfo, analyze_text, + Automaton, BinaryFormat, DocMapper, DocMapperBuilder, ExactSetAutomaton, FastFieldWarmupInfo, + FieldMappingEntry, FieldMappingType, JsonObject, NamedField, QuickwitBytesOptions, + QuickwitJsonOptions, TermRange, TokenizerConfig, TokenizerEntry, WarmupInfo, analyze_text, }; use 
doc_mapper::{ FastFieldOptions, FieldMappingEntryForSerialization, IndexRecordOptionSchema, diff --git a/quickwit/quickwit-doc-mapper/src/query_builder.rs b/quickwit/quickwit-doc-mapper/src/query_builder.rs index 5900b577795..38d4bab60b1 100644 --- a/quickwit/quickwit-doc-mapper/src/query_builder.rs +++ b/quickwit/quickwit-doc-mapper/src/query_builder.rs @@ -30,7 +30,7 @@ use tantivy::schema::{Field, Schema}; use tracing::error; use crate::doc_mapper::FastFieldWarmupInfo; -use crate::{Automaton, QueryParserError, TermRange, WarmupInfo}; +use crate::{Automaton, ExactSetAutomaton, QueryParserError, TermRange, WarmupInfo}; #[derive(Default)] struct RangeQueryFields { @@ -198,8 +198,7 @@ pub(crate) fn build_query( let query = query_ast.build_tantivy_query(context)?; - let term_set_query_fields = extract_term_set_query_fields(&query_ast, context.schema)?; - let (term_ranges_grouped_by_field, automatons_grouped_by_field) = + let (term_ranges_grouped_by_field, mut automatons_grouped_by_field) = extract_prefix_term_ranges_and_automaton( &query_ast, context.schema, @@ -219,8 +218,13 @@ pub(crate) fn build_query( .or_default() |= need_position; }); + coalesce_multi_term_fields_into_automatons( + &mut terms_grouped_by_field, + &mut automatons_grouped_by_field, + 2, + )?; + let warmup_info = WarmupInfo { - term_dict_fields: term_set_query_fields, terms_grouped_by_field, term_ranges_grouped_by_field, fast_fields, @@ -231,44 +235,54 @@ pub(crate) fn build_query( Ok((query, warmup_info)) } -struct ExtractTermSetFields<'a> { - term_dict_fields_to_warm_up: HashSet, - schema: &'a Schema, -} - -impl<'a> ExtractTermSetFields<'a> { - fn new(schema: &'a Schema) -> Self { - ExtractTermSetFields { - term_dict_fields_to_warm_up: HashSet::new(), - schema, +/// For any field with more than `term_threshold` non-positional terms, moves +/// those terms into an `Automaton::TermSet` and removes them from +/// `terms_grouped_by_field`. 
+/// +/// This enables `warm_postings_automaton` to coalesce both the SSTable block +/// fetches and the postings downloads into a small number of merged range +/// requests, instead of N individual per-term requests. +/// +/// More than `term_threshold` terms are required because +/// `warm_postings_automaton` has higher per-call overhead than a direct point +/// lookup: spawning a CPU task and traversing the sstable twice. That overhead +/// is only worth paying when there are enough terms to coalesce. +/// +/// Terms that require positions are left in `terms_grouped_by_field` unchanged, +/// as they must be fetched individually. +/// +/// TODO: should positional terms also support some form of grouping? +fn coalesce_multi_term_fields_into_automatons( + terms_grouped_by_field: &mut HashMap>, + automatons_grouped_by_field: &mut HashMap>, + term_threshold: usize, +) -> anyhow::Result<()> { + let fields: Vec = terms_grouped_by_field.keys().copied().collect(); + for field in fields { + let no_pos_terms: Vec<&Term> = terms_grouped_by_field + .get(&field) + .unwrap() + .iter() + .filter(|(_, need_pos)| !**need_pos) + .map(|(term, _)| term) + .collect(); + if no_pos_terms.len() <= term_threshold { + continue; } - } -} - -impl<'a> QueryAstVisitor<'a> for ExtractTermSetFields<'_> { - type Err = anyhow::Error; - - fn visit_term_set(&mut self, term_set_query: &'a TermSetQuery) -> anyhow::Result<()> { - for field in term_set_query.terms_per_field.keys() { - if let Some((field, _field_entry, _path)) = - find_field_or_hit_dynamic(field, self.schema) - { - self.term_dict_fields_to_warm_up.insert(field); - } else { - anyhow::bail!("field does not exist: {}", field); - } + let automaton = ExactSetAutomaton::try_from_terms(no_pos_terms)?; + automatons_grouped_by_field + .entry(field) + .or_default() + .insert(Automaton::TermSet(automaton)); + // Remove the no-position terms: the automaton covers their SSTable lookup + postings. 
+ // Terms still needing positions are kept for warm_up_terms. + let field_terms = terms_grouped_by_field.get_mut(&field).unwrap(); + field_terms.retain(|_, need_pos| *need_pos); + if field_terms.is_empty() { + terms_grouped_by_field.remove(&field); } - Ok(()) } -} - -fn extract_term_set_query_fields( - query_ast: &QueryAst, - schema: &Schema, -) -> anyhow::Result> { - let mut visitor = ExtractTermSetFields::new(schema); - visitor.visit(query_ast)?; - Ok(visitor.term_dict_fields_to_warm_up) + Ok(()) } /// Converts a `prefix` term into the equivalent term range. @@ -437,7 +451,7 @@ mod test { use tantivy::schema::{DateOptions, DateTimePrecision, FAST, INDEXED, STORED, Schema, TEXT}; use super::{ExtractPrefixTermRanges, build_query}; - use crate::{DYNAMIC_FIELD_NAME, SOURCE_FIELD_NAME, TermRange}; + use crate::{Automaton, DYNAMIC_FIELD_NAME, SOURCE_FIELD_NAME, TermRange}; enum TestExpectation<'a> { Err(&'a str), @@ -881,26 +895,96 @@ mod test { #[test] fn test_build_query_warmup_info() { - let query_with_set = query_ast_from_user_text("desc: IN [hello]", None) + let query_with_set = query_ast_from_user_text("desc: IN [alpha beta gamma delta]", None) + .parse_user_query(&[]) + .unwrap(); + let query_with_small_set = query_ast_from_user_text("desc: IN [beta]", None) .parse_user_query(&[]) .unwrap(); - let query_without_set = query_ast_from_user_text("desc:hello", None) + let query_with_many_terms = + query_ast_from_user_text("desc:(hello OR world OR extra OR big)", None) + .parse_user_query(&[]) + .unwrap(); + let query_with_single_term = query_ast_from_user_text("desc:hello", None) .parse_user_query(&[]) .unwrap(); let schema = make_schema(true); let context = BuildTantivyAstContext::for_test(&schema); - let (_, warmup_info) = build_query(query_with_set, &context, None).unwrap(); - assert_eq!(warmup_info.term_dict_fields.len(), 1); + for query in [query_with_many_terms, query_with_set] { + let (_, warmup_info) = build_query(query, &context, None).unwrap(); + 
assert!(warmup_info.terms_grouped_by_field.is_empty()); + assert_eq!(warmup_info.automatons_grouped_by_field.len(), 1); + let automatons = warmup_info + .automatons_grouped_by_field + .values() + .next() + .unwrap(); + assert_eq!(automatons.len(), 1); + assert!(matches!( + automatons.iter().next().unwrap(), + Automaton::TermSet(_) + )); + } + + for query in [query_with_small_set, query_with_single_term] { + let (_, warmup_info) = build_query(query, &context, None).unwrap(); + assert!(warmup_info.automatons_grouped_by_field.is_empty()); + } + } + + #[test] + fn test_build_query_warmup_info_term_set_with_other_queries() { + // Verify that: + // - fields with >= 3 non-positional terms are coalesced into an automaton + // - positional terms on the same field remain in terms_grouped_by_field + // - fields with fewer than 3 non-positional terms are unaffected + let query_ast = query_ast_from_user_text( + r#"desc: IN [alpha beta gamma] AND desc:"world extra" AND title:baz"#, + None, + ) + .parse_user_query(&[]) + .unwrap(); + + let schema = make_schema(false); + let context = BuildTantivyAstContext::for_test(&schema); + let (_, warmup_info) = build_query(query_ast, &context, None).unwrap(); + + let desc_field = schema.get_field("desc").unwrap(); + let title_field = schema.get_field("title").unwrap(); + + // desc: 3 non-positional terms (alpha, beta, gamma) are coalesced into an automaton + let desc_automatons = warmup_info + .automatons_grouped_by_field + .get(&desc_field) + .expect("desc should have an automaton"); + assert_eq!(desc_automatons.len(), 1); + assert!(matches!( + desc_automatons.iter().next().unwrap(), + Automaton::TermSet(_) + )); + + // desc: phrase terms "world" and "extra" stay as positional terms + let desc_terms = warmup_info + .terms_grouped_by_field + .get(&desc_field) + .expect("desc positional terms should still be present"); + assert_eq!(desc_terms.len(), 2); + assert!(desc_terms.values().all(|&need_pos| need_pos)); + + // title: only 1 
non-positional term (below threshold), stays in terms_grouped_by_field assert!( - warmup_info - .term_dict_fields - .contains(&tantivy::schema::Field::from_field_id(2)) + !warmup_info + .automatons_grouped_by_field + .contains_key(&title_field) ); - - let (_, warmup_info) = build_query(query_without_set, &context, None).unwrap(); - assert!(warmup_info.term_dict_fields.is_empty()); + let title_terms = warmup_info + .terms_grouped_by_field + .get(&title_field) + .expect("title terms should be present"); + assert_eq!(title_terms.len(), 1); + assert!(title_terms.values().all(|&need_pos| !need_pos)); } #[test] diff --git a/quickwit/quickwit-indexing/src/actors/indexer.rs b/quickwit/quickwit-indexing/src/actors/indexer.rs index 64a08d3f5da..f4674ea84ba 100644 --- a/quickwit/quickwit-indexing/src/actors/indexer.rs +++ b/quickwit/quickwit-indexing/src/actors/indexer.rs @@ -44,6 +44,7 @@ use tantivy::schema::{Field, Schema, Value}; use tantivy::store::{Compressor, ZstdCompressor}; use tantivy::tokenizer::TokenizerManager; use tantivy::{DateTime, IndexBuilder, IndexSettings}; +use time::OffsetDateTime; use tokio::runtime::Handle; use tokio::sync::Semaphore; use tracing::{Span, info, info_span, warn}; @@ -99,6 +100,7 @@ struct IndexerState { max_num_partitions: NonZeroU32, index_settings: IndexSettings, cooperative_indexing_opt: Option, + indexation_time_field_opt: Option, } impl IndexerState { @@ -300,7 +302,15 @@ impl IndexerState { .context("batch delta does not follow indexer checkpoint")?; let mut memory_usage_delta: i64 = 0; counters.num_doc_batches_in_workbench += 1; - for doc in batch.docs { + let indexation_time_opt = self + .indexation_time_field_opt + .map(|_| DateTime::from_utc(OffsetDateTime::now_utc())); + for mut doc in batch.docs { + if let (Some(indexation_time), Some(indexation_time_field)) = + (indexation_time_opt, self.indexation_time_field_opt) + { + doc.doc.add_date(indexation_time_field, indexation_time); + } let ProcessedDoc { doc, timestamp_opt, @@ 
-589,6 +599,17 @@ impl Indexer { cooperative_indexing_permits, ) }); + let indexation_time_field_opt = + doc_mapper + .indexation_time_field_name() + .and_then(|name| match schema.get_field(name) { + Ok(field) => Some(field), + Err(_) => { + warn!("failed to find indexation time field '{}' in schema", name); + None + } + }); + Self { indexer_state: IndexerState { pipeline_id, @@ -604,6 +625,7 @@ impl Indexer { index_settings, max_num_partitions: doc_mapper.max_num_partitions(), cooperative_indexing_opt, + indexation_time_field_opt, }, index_serializer_mailbox, indexing_workbench_opt: None, @@ -743,7 +765,7 @@ mod tests { EmptyResponse, LastDeleteOpstampResponse, MockMetastoreService, }; use quickwit_proto::types::{IndexUid, NodeId, PipelineUid}; - use tantivy::{DateTime, doc}; + use tantivy::{DateTime, DocAddress, ReloadPolicy, TantivyDocument, doc}; use super::*; use crate::actors::indexer::{IndexerCounters, record_timestamp}; @@ -1851,4 +1873,161 @@ mod tests { universe.assert_quit().await; Ok(()) } + + fn doc_mapper_with_indexation_time() -> DocMapper { + const JSON_CONFIG_VALUE: &str = r#" + { + "store_source": true, + "index_field_presence": true, + "default_search_fields": ["body"], + "timestamp_field": "timestamp", + "indexation_time_field": "indexed_at", + "field_mappings": [ + { + "name": "timestamp", + "type": "datetime", + "output_format": "unix_timestamp_secs", + "fast": true + }, + { + "name": "body", + "type": "text", + "stored": true + }, + { + "name": "indexed_at", + "type": "datetime", + "output_format": "unix_timestamp_secs", + "fast": true, + "stored": true + } + ] + }"#; + serde_json::from_str::(JSON_CONFIG_VALUE).unwrap() + } + + #[tokio::test] + async fn test_indexer_sets_indexation_time() -> anyhow::Result<()> { + let index_uid = IndexUid::new_with_random_ulid("test-index"); + let pipeline_id = IndexingPipelineId { + index_uid: index_uid.clone(), + source_id: "test-source".to_string(), + node_id: NodeId::from("test-node"), + pipeline_uid: 
PipelineUid::default(), + }; + let doc_mapper = Arc::new(doc_mapper_with_indexation_time()); + let last_delete_opstamp = 10; + let schema = doc_mapper.schema(); + let body_field = schema.get_field("body").unwrap(); + let timestamp_field = schema.get_field("timestamp").unwrap(); + let indexed_at_field = schema.get_field("indexed_at").unwrap(); + let indexing_directory = TempDirectory::for_test(); + let mut indexing_settings = IndexingSettings::for_test(); + indexing_settings.split_num_docs_target = 3; + let universe = Universe::with_accelerated_time(); + let (index_serializer_mailbox, index_serializer_inbox) = universe.create_test_mailbox(); + let mut mock_metastore = MockMetastoreService::new(); + mock_metastore + .expect_last_delete_opstamp() + .times(1) + .returning(move |delete_opstamp_request| { + assert_eq!(delete_opstamp_request.index_uid(), &index_uid); + Ok(LastDeleteOpstampResponse::new(last_delete_opstamp)) + }); + mock_metastore.expect_publish_splits().never(); + let indexer = Indexer::new( + pipeline_id, + doc_mapper, + MetastoreServiceClient::from_mock(mock_metastore), + indexing_directory, + indexing_settings, + None, + index_serializer_mailbox, + ); + let (indexer_mailbox, indexer_handle) = universe.spawn_builder().spawn(indexer); + + // Send 3 docs in a single batch so they all share the same indexation timestamp + // (the timestamp is sampled once per batch in `index_batch`). 
+ indexer_mailbox + .send_message(ProcessedDocBatch::new( + vec![ + ProcessedDoc { + doc: doc!( + body_field => "document 1", + timestamp_field => DateTime::from_timestamp_secs(1_662_000_001), + ), + timestamp_opt: Some(DateTime::from_timestamp_secs(1_662_000_001)), + partition: 1, + num_bytes: 30, + }, + ProcessedDoc { + doc: doc!( + body_field => "document 2", + timestamp_field => DateTime::from_timestamp_secs(1_662_000_002), + ), + timestamp_opt: Some(DateTime::from_timestamp_secs(1_662_000_002)), + partition: 1, + num_bytes: 30, + }, + ProcessedDoc { + doc: doc!( + body_field => "document 3", + timestamp_field => DateTime::from_timestamp_secs(1_662_000_003), + ), + timestamp_opt: Some(DateTime::from_timestamp_secs(1_662_000_003)), + partition: 1, + num_bytes: 30, + }, + ], + SourceCheckpointDelta::from_range(0..3), + false, + )) + .await?; + + indexer_handle.process_pending_and_observe().await; + let messages: Vec = index_serializer_inbox.drain_for_test_typed(); + assert_eq!(messages.len(), 1); + let batch = messages.into_iter().next().unwrap(); + assert_eq!(batch.commit_trigger, CommitTrigger::NumDocsLimit); + assert_eq!(batch.splits.len(), 1); + assert_eq!(batch.splits[0].split_attrs.num_docs, 3); + + // Finalize the split and open the tantivy index to verify the `indexed_at` field. + let indexed_split = batch.splits.into_iter().next().unwrap().finalize()?; + let reader = indexed_split + .index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into()?; + let searcher = reader.searcher(); + + // Collect every `indexed_at` value present in the split. 
+ let mut indexed_at_values: Vec = Vec::new(); + for (segment_ord, segment_reader) in searcher.segment_readers().iter().enumerate() { + for doc_id in 0..segment_reader.max_doc() { + let doc_address = DocAddress::new(segment_ord as u32, doc_id); + let doc: TantivyDocument = searcher.doc(doc_address)?; + let indexed_at = doc + .get_first(indexed_at_field) + .and_then(|val| val.as_datetime()) + .expect("indexed_at field must be set on every indexed document"); + indexed_at_values.push(indexed_at); + } + } + + // All 3 documents must have been stamped with the indexation time. + assert_eq!(indexed_at_values.len(), 3); + // Because the timestamp is captured once for the whole batch, every document + // in the batch must carry exactly the same `indexed_at` value. + let first = indexed_at_values[0]; + for val in &indexed_at_values { + assert_eq!( + *val, first, + "all documents in the same batch must share the same indexed_at timestamp" + ); + } + + universe.assert_quit().await; + Ok(()) + } } diff --git a/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs b/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs index 99065651db1..65bd824b1b9 100644 --- a/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs +++ b/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs @@ -159,7 +159,9 @@ impl IndexingPipeline { let indexing_pipelines_gauge = crate::metrics::INDEXER_METRICS .indexing_pipelines .with_label_values([¶ms.pipeline_id.index_uid.index_id]); - let indexing_pipelines_gauge_guard = OwnedGaugeGuard::from_gauge(indexing_pipelines_gauge); + let mut indexing_pipelines_gauge_guard = + OwnedGaugeGuard::from_gauge(indexing_pipelines_gauge); + indexing_pipelines_gauge_guard.add(1); let params_fingerprint = params.params_fingerprint; IndexingPipeline { params, diff --git a/quickwit/quickwit-indexing/src/actors/indexing_service.rs b/quickwit/quickwit-indexing/src/actors/indexing_service.rs index afd2637c02c..363c9891f0c 100644 --- 
a/quickwit/quickwit-indexing/src/actors/indexing_service.rs +++ b/quickwit/quickwit-indexing/src/actors/indexing_service.rs @@ -16,6 +16,7 @@ use std::collections::{HashMap, HashSet}; use std::fmt::{Debug, Formatter}; use std::path::PathBuf; use std::sync::Arc; +use std::time::{Duration, Instant}; use anyhow::Context; use async_trait::async_trait; @@ -895,6 +896,7 @@ impl Handler for IndexingService { msg: ObservePipeline, _ctx: &ActorContext, ) -> Result { + let _slow_handler_guard = SlowHandlerGuard::new("observe_pipeline"); let pipeline_uid = msg.pipeline_id.pipeline_uid; let observation = self.observe_pipeline(&pipeline_uid).await; Ok(observation) @@ -910,6 +912,7 @@ impl Handler for IndexingService { msg: DetachIndexingPipeline, _ctx: &ActorContext, ) -> Result { + let _slow_handler_guard = SlowHandlerGuard::new("detach_indexing_pipeline"); let pipeline_uid = msg.pipeline_id.pipeline_uid; let detach_pipeline_result = self.detach_indexing_pipeline(&pipeline_uid).await; Ok(detach_pipeline_result) @@ -925,6 +928,7 @@ impl Handler for IndexingService { msg: DetachMergePipeline, _ctx: &ActorContext, ) -> Result { + let _slow_handler_guard = SlowHandlerGuard::new("detach_merge_pipeline"); Ok(self.detach_merge_pipeline(&msg.pipeline_id).await) } } @@ -941,6 +945,7 @@ impl Handler for IndexingService { _message: SuperviseLoop, ctx: &ActorContext, ) -> Result<(), ActorExitStatus> { + let _slow_handler_guard = SlowHandlerGuard::new("supervise_loop"); self.handle_supervise().await?; ctx.schedule_self_msg(*quickwit_actors::HEARTBEAT, SuperviseLoop); Ok(()) @@ -969,6 +974,7 @@ impl Handler for IndexingService { message: SpawnPipeline, ctx: &ActorContext, ) -> Result, ActorExitStatus> { + let _slow_handler_guard = SlowHandlerGuard::new("spawn_pipeline"); Ok(self .spawn_pipeline( ctx, @@ -989,6 +995,7 @@ impl Handler for IndexingService { plan_request: ApplyIndexingPlanRequest, ctx: &ActorContext, ) -> Result { + let _slow_handler_guard = 
SlowHandlerGuard::new("apply_indexing_plan"); Ok(self .apply_indexing_plan(&plan_request.indexing_tasks, ctx) .await @@ -1016,6 +1023,32 @@ struct IndexingPipelineDiff { pipelines_to_spawn: Vec, } +/// Logs a warning every 5 seconds until dropped. Useful to identify slow +/// handlers that might compromise liveness checks. +pub struct SlowHandlerGuard { + _cancel_tx: oneshot::Sender<()>, +} + +impl SlowHandlerGuard { + pub fn new(handler_name: &'static str) -> Self { + let (cancel_tx, mut cancel_rx) = oneshot::channel::<()>(); + let start = Instant::now(); + tokio::spawn(async move { + loop { + tokio::select! { + _ = tokio::time::sleep(Duration::from_secs(5)) => { + warn!(handler=handler_name, elapsed_secs=start.elapsed().as_secs(), "slow indexing service handler"); + } + _ = &mut cancel_rx => { break; } + } + } + }); + Self { + _cancel_tx: cancel_tx, + } + } +} + #[cfg(test)] mod tests { use std::num::NonZeroUsize; diff --git a/quickwit/quickwit-indexing/src/actors/merge_executor.rs b/quickwit/quickwit-indexing/src/actors/merge_executor.rs index 6b753c7e13b..660a8b62d05 100644 --- a/quickwit/quickwit-indexing/src/actors/merge_executor.rs +++ b/quickwit/quickwit-indexing/src/actors/merge_executor.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::collections::BTreeSet; +use std::collections::{BTreeSet, HashMap}; use std::ops::RangeInclusive; use std::path::Path; use std::sync::Arc; @@ -40,20 +40,40 @@ use quickwit_query::query_ast::QueryAst; use tantivy::directory::{Advice, DirectoryClone, MmapDirectory, RamDirectory}; use tantivy::index::SegmentId; use tantivy::tokenizer::TokenizerManager; -use tantivy::{DateTime, Directory, Index, IndexMeta, IndexWriter, SegmentReader}; +use tantivy::{DateTime, Directory, DocId, Index, IndexMeta, IndexWriter, SegmentReader}; use tokio::runtime::Handle; use tracing::{debug, error, info, instrument, warn}; use crate::actors::Packager; use crate::controlled_directory::ControlledDirectory; use crate::merge_policy::MergeOperationType; -use crate::models::{IndexedSplit, IndexedSplitBatch, MergeScratch, PublishLock, SplitAttrs}; +use crate::models::{ + IndexedSplit, IndexedSplitBatch, MergeScratch, PublishLock, ReplacedSplit, SplitAttrs, +}; +use crate::soft_delete_query::SoftDeletedDocIdsQuery; + +/// The mapping resolution associated to the merge. To perform deletes a full doc +/// mapper is required. For regular merges, we only need the tokenizer manager. +#[derive(Clone)] +enum MapperContext { + TokenizersOnly(quickwit_query::tokenizers::TokenizerManager), + DocMapper(Arc), +} + +impl MapperContext { + fn tokenizer_manager(&self) -> quickwit_query::tokenizers::TokenizerManager { + match self { + MapperContext::TokenizersOnly(tokenizer_manager) => tokenizer_manager.clone(), + MapperContext::DocMapper(doc_mapper) => doc_mapper.tokenizer_manager().clone(), + } + } +} #[derive(Clone)] pub struct MergeExecutor { pipeline_id: MergePipelineId, metastore: MetastoreServiceClient, - doc_mapper: Arc, + mapper_context: MapperContext, io_controls: IoControls, merge_packager_mailbox: Mailbox, } @@ -106,14 +126,16 @@ impl Handler for MergeExecutor { // A failure in a merge is a bit special. // // Instead of failing the pipeline, we just log it. 
- // The idea is to limit the risk associated with a potential split of death. + // The idea is to limit the risk associated with a potential split of + // death. // - // Such a split is now not tracked by the merge planner and won't undergo a - // merge until the merge pipeline is restarted. + // Such a split is now not tracked by the merge planner and won't + // undergo a merge until the merge pipeline + // is restarted. // - // With a merge policy that marks splits as mature after a day or so, this - // limits the noise associated to those failed - // merges. + // With a merge policy that marks splits as mature after a day or so, + // this limits the noise associated to those + // failed merges. error!(task=?merge_task, err=?err, "failed to merge splits"); return Ok(()); } @@ -171,21 +193,23 @@ fn combine_index_meta(mut index_metas: Vec) -> anyhow::Result>, Vec)>; + fn open_split_directories( // Directories containing the splits to merge tantivy_dirs: &[Box], tokenizer_manager: &TokenizerManager, -) -> anyhow::Result<(IndexMeta, Vec>)> { +) -> OpenSplitDirsResult { let mut directories: Vec> = Vec::new(); - let mut index_metas = Vec::new(); + let mut index_metas: Vec = Vec::new(); for tantivy_dir in tantivy_dirs { directories.push(tantivy_dir.clone()); - let index_meta = open_index(tantivy_dir.clone(), tokenizer_manager)?.load_metas()?; index_metas.push(index_meta); } + let per_split_metas = index_metas.clone(); let union_index_meta = combine_index_meta(index_metas)?; - Ok((union_index_meta, directories)) + Ok((union_index_meta, directories, per_split_metas)) } /// Creates a directory with a single `meta.json` file describe in `index_meta` @@ -278,11 +302,23 @@ pub fn merge_split_attrs( let partition_id = combine_partition_ids_aux(splits.iter().map(|split| split.partition_id)); let time_range: Option> = merge_time_range(splits); let secondary_time_range = merge_secondary_time_range_if_exists(splits); - let uncompressed_docs_size_in_bytes = 
sum_doc_sizes_in_bytes(splits); - let num_docs = sum_num_docs(splits); - let replaced_split_ids: Vec = splits + let total_soft_deleted: u64 = splits + .iter() + .map(|split| split.soft_deleted_doc_ids.len() as u64) + .sum(); + let raw_num_docs = sum_num_docs(splits); + let num_docs = raw_num_docs.saturating_sub(total_soft_deleted); + let uncompressed_docs_size_in_bytes = if raw_num_docs > 0 { + (sum_doc_sizes_in_bytes(splits) as f64 * num_docs as f64 / raw_num_docs as f64) as u64 + } else { + 0 + }; + let replaced_splits = splits .iter() - .map(|split| split.split_id().to_string()) + .map(|split| ReplacedSplit { + split_id: split.split_id.clone(), + soft_deleted_doc_ids: split.soft_deleted_doc_ids.clone(), + }) .collect(); let delete_opstamp = splits .iter() @@ -306,13 +342,13 @@ pub fn merge_split_attrs( doc_mapping_uid, split_id: merge_split_id, partition_id, - replaced_split_ids, time_range, secondary_time_range, num_docs, uncompressed_docs_size_in_bytes, delete_opstamp, num_merge_ops: max_merge_ops(splits) + 1, + replaced_splits, }) } @@ -324,6 +360,16 @@ fn max_merge_ops(splits: &[SplitMetadata]) -> usize { .unwrap_or(0) } +struct MergeDirectoriesInput { + union_index_meta: IndexMeta, + split_directories: Vec>, + delete_tasks: Vec, + /// Required when `delete_tasks` is non-empty; unused otherwise. + doc_mapper_opt: Option>, + /// Maps each segment ID to the sorted list of soft-deleted doc IDs to remove. + soft_deleted_docs: HashMap>, +} + impl MergeExecutor { pub fn new( pipeline_id: MergePipelineId, @@ -335,7 +381,24 @@ impl MergeExecutor { MergeExecutor { pipeline_id, metastore, - doc_mapper, + mapper_context: MapperContext::DocMapper(doc_mapper), + io_controls, + merge_packager_mailbox, + } + } + + /// Creates a simpler MergeExecutor that doesn't support deletes. 
+ pub fn new_with_tokenizers_only( + pipeline_id: MergePipelineId, + metastore: MetastoreServiceClient, + tokenizer_manager: quickwit_query::tokenizers::TokenizerManager, + io_controls: IoControls, + merge_packager_mailbox: Mailbox, + ) -> Self { + MergeExecutor { + pipeline_id, + metastore, + mapper_context: MapperContext::TokenizersOnly(tokenizer_manager), io_controls, merge_packager_mailbox, } @@ -349,18 +412,33 @@ impl MergeExecutor { merge_scratch_directory: TempDirectory, ctx: &ActorContext, ) -> anyhow::Result { - let (union_index_meta, split_directories) = open_split_directories( + let (union_index_meta, split_directories, per_split_metas) = open_split_directories( &tantivy_dirs, - self.doc_mapper.tokenizer_manager().tantivy_manager(), + self.mapper_context.tokenizer_manager().tantivy_manager(), )?; + // Build a mapping from each segment ID to the soft-deleted doc IDs of its parent split. + let soft_deleted_docs: HashMap> = per_split_metas + .iter() + .zip(splits.iter()) + .filter(|(_, split)| !split.soft_deleted_doc_ids.is_empty()) + .flat_map(|(meta, split)| { + let doc_ids: Vec = split.soft_deleted_doc_ids.iter().copied().collect(); + meta.segments + .iter() + .map(move |seg_meta| (seg_meta.id(), doc_ids.clone())) + }) + .collect(); // TODO it would be nice if tantivy could let us run the merge in the current thread. fail_point!("before-merge-split"); let controlled_directory = self .merge_split_directories( - union_index_meta, - split_directories, - Vec::new(), - None, + MergeDirectoriesInput { + union_index_meta, + split_directories, + delete_tasks: Vec::new(), + doc_mapper_opt: None, + soft_deleted_docs, + }, merge_scratch_directory.path(), ctx, ) @@ -371,17 +449,18 @@ impl MergeExecutor { // splits. 
let merged_index = open_index( controlled_directory.clone(), - self.doc_mapper.tokenizer_manager().tantivy_manager(), + self.mapper_context.tokenizer_manager().tantivy_manager(), )?; ctx.record_progress(); let split_attrs = merge_split_attrs(self.pipeline_id.clone(), merge_split_id, &splits)?; - Ok(IndexedSplit { + let indexed_split = IndexedSplit { split_attrs, index: merged_index, split_scratch_directory: merge_scratch_directory, controlled_directory_opt: Some(controlled_directory), - }) + }; + Ok(indexed_split) } async fn process_delete_and_merge( @@ -392,6 +471,9 @@ impl MergeExecutor { merge_scratch_directory: TempDirectory, ctx: &ActorContext, ) -> anyhow::Result> { + let MapperContext::DocMapper(doc_mapper) = &self.mapper_context else { + anyhow::bail!("DocMapper is required to process delete and merge operations"); + }; let list_delete_tasks_request = ListDeleteTasksRequest::new(split.index_uid.clone(), split.delete_opstamp); let delete_tasks = ctx @@ -417,16 +499,34 @@ impl MergeExecutor { num_delete_tasks = delete_tasks.len() ); - let (union_index_meta, split_directories) = open_split_directories( + let (union_index_meta, split_directories, per_split_metas) = open_split_directories( &tantivy_dirs, - self.doc_mapper.tokenizer_manager().tantivy_manager(), + doc_mapper.tokenizer_manager().tantivy_manager(), )?; + // Build a mapping from each segment ID to the soft-deleted doc IDs of the input split. 
+ let soft_deleted_docs: HashMap> = + if split.soft_deleted_doc_ids.is_empty() { + HashMap::new() + } else { + let doc_ids: Vec = split.soft_deleted_doc_ids.iter().copied().collect(); + per_split_metas + .iter() + .flat_map(|meta| { + meta.segments + .iter() + .map(|seg_meta| (seg_meta.id(), doc_ids.clone())) + }) + .collect() + }; let controlled_directory = self .merge_split_directories( - union_index_meta, - split_directories, - delete_tasks, - Some(self.doc_mapper.clone()), + MergeDirectoriesInput { + union_index_meta, + split_directories, + delete_tasks, + doc_mapper_opt: Some(doc_mapper.clone()), + soft_deleted_docs, + }, merge_scratch_directory.path(), ctx, ) @@ -435,12 +535,7 @@ impl MergeExecutor { // This will have the side effect of deleting the directory containing the downloaded split. let mut merged_index = Index::open(controlled_directory.clone())?; ctx.record_progress(); - merged_index.set_tokenizers( - self.doc_mapper - .tokenizer_manager() - .tantivy_manager() - .clone(), - ); + merged_index.set_tokenizers(doc_mapper.tokenizer_manager().tantivy_manager().clone()); merged_index.set_fast_field_tokenizers( get_quickwit_fastfield_normalizer_manager() .tantivy_manager() @@ -473,8 +568,7 @@ impl MergeExecutor { let uncompressed_docs_size_in_bytes = (num_docs as f32 * split.uncompressed_docs_size_in_bytes as f32 / split.num_docs as f32) as u64; - let time_range = if let Some(timestamp_field_name) = self.doc_mapper.timestamp_field_name() - { + let time_range = if let Some(timestamp_field_name) = doc_mapper.timestamp_field_name() { let reader = merged_segment_reader .fast_fields() .date(timestamp_field_name)?; @@ -490,13 +584,16 @@ impl MergeExecutor { doc_mapping_uid: split.doc_mapping_uid, split_id: merge_split_id, partition_id: split.partition_id, - replaced_split_ids: vec![split.split_id.clone()], time_range, secondary_time_range: None, num_docs, uncompressed_docs_size_in_bytes, delete_opstamp: last_delete_opstamp, num_merge_ops: split.num_merge_ops, + 
replaced_splits: vec![ReplacedSplit { + split_id: split.split_id.clone(), + soft_deleted_doc_ids: split.soft_deleted_doc_ids.clone(), + }], }, index: merged_index, split_scratch_directory: merge_scratch_directory, @@ -507,13 +604,17 @@ impl MergeExecutor { async fn merge_split_directories( &self, - union_index_meta: IndexMeta, - split_directories: Vec>, - delete_tasks: Vec, - doc_mapper_opt: Option>, + input: MergeDirectoriesInput, output_path: &Path, ctx: &ActorContext, ) -> anyhow::Result { + let MergeDirectoriesInput { + union_index_meta, + split_directories, + delete_tasks, + doc_mapper_opt, + soft_deleted_docs, + } = input; let shadowing_meta_json_directory = create_shadowing_meta_json_directory(union_index_meta)?; // This directory is here to receive the merged split, as well as the final meta.json file. @@ -535,7 +636,7 @@ impl MergeExecutor { let union_directory = UnionDirectory::union_of(directory_stack); let union_index = open_index( union_directory, - self.doc_mapper.tokenizer_manager().tantivy_manager(), + self.mapper_context.tokenizer_manager().tantivy_manager(), )?; ctx.record_progress(); @@ -543,6 +644,12 @@ impl MergeExecutor { let mut index_writer: IndexWriter = union_index.writer_with_num_threads(1, 15_000_000)?; let num_delete_tasks = delete_tasks.len(); + let has_soft_deletes = !soft_deleted_docs.is_empty(); + // Hard-delete soft-deleted doc IDs before applying delete-task queries so that both + // sources of deletion are committed together in a single pass. 
+ if has_soft_deletes { + index_writer.delete_query(Box::new(SoftDeletedDocIdsQuery::new(soft_deleted_docs)))?; + } if num_delete_tasks > 0 { let doc_mapper = doc_mapper_opt .ok_or_else(|| anyhow!("doc mapper must be present if there are delete tasks"))?; @@ -564,6 +671,8 @@ impl MergeExecutor { doc_mapper.query(union_index.schema(), parsed_query_ast, false, None)?; index_writer.delete_query(query)?; } + } + if has_soft_deletes || num_delete_tasks > 0 { debug!("commit-delete-operations"); index_writer.commit()?; } @@ -574,13 +683,13 @@ impl MergeExecutor { .map(|segment_meta| segment_meta.id()) .collect(); - // A merge is useless if there is no delete and only one segment. - if num_delete_tasks == 0 && segment_ids.len() <= 1 { + // A merge is useless if there are no deletions and only one segment. + if !has_soft_deletes && num_delete_tasks == 0 && segment_ids.len() <= 1 { return Ok(output_directory); } - // If after deletion there is no longer any document, don't try to merge. - if num_delete_tasks != 0 && segment_ids.is_empty() { + // If after deletion there are no remaining documents, don't try to merge. 
+ if (has_soft_deletes || num_delete_tasks != 0) && segment_ids.is_empty() { return Ok(output_directory); } @@ -713,6 +822,287 @@ mod tests { Ok(()) } + #[tokio::test] + async fn test_merge_executor_with_soft_deleted_docs() -> anyhow::Result<()> { + let doc_mapping_yaml = r#" + field_mappings: + - name: body + type: text + - name: ts + type: datetime + input_formats: + - unix_timestamp + fast: true + timestamp_field: ts + "#; + let test_sandbox = + TestSandbox::create("test-index-soft-delete", doc_mapping_yaml, "", &["body"]).await?; + for split_id in 0..4 { + let single_doc = std::iter::once( + serde_json::json!({"body ": format!("split{split_id}"), "ts": 1631072713u64 + split_id }), + ); + test_sandbox.add_documents(single_doc).await?; + } + let metastore = test_sandbox.metastore(); + let index_uid = test_sandbox.index_uid(); + + // Load the initial split metadata to obtain split IDs. + let split_metas: Vec = metastore + .list_splits(ListSplitsRequest::try_from_index_uid(index_uid.clone()).unwrap()) + .await + .unwrap() + .collect_splits_metadata() + .await + .unwrap(); + assert_eq!(split_metas.len(), 4); + + // Soft-delete doc_id=0 from the first split. + // Each split contains exactly one document, so doc_id=0 is the only document. + let soft_deleted_split_id = split_metas[0].split_id.clone(); + metastore + .soft_delete_documents(quickwit_proto::metastore::SoftDeleteDocumentsRequest { + index_uid: Some(index_uid.clone()), + split_doc_ids: vec![quickwit_proto::metastore::SplitDocIds { + split_id: soft_deleted_split_id, + doc_ids: vec![0], + }], + }) + .await?; + + // Reload split metadata so that soft_deleted_doc_ids is populated. 
+ let split_metas: Vec = metastore + .list_splits(ListSplitsRequest::try_from_index_uid(index_uid.clone()).unwrap()) + .await + .unwrap() + .collect_splits_metadata() + .await + .unwrap(); + assert_eq!( + split_metas + .iter() + .map(|s| s.soft_deleted_doc_ids.len()) + .sum::(), + 1, + "exactly one doc should be soft-deleted across all splits" + ); + + let merge_scratch_directory = TempDirectory::for_test(); + let downloaded_splits_directory = + merge_scratch_directory.named_temp_child("downloaded-splits-")?; + let mut tantivy_dirs: Vec> = Vec::new(); + for split_meta in &split_metas { + let split_filename = split_file(split_meta.split_id()); + let dest_filepath = downloaded_splits_directory.path().join(&split_filename); + test_sandbox + .storage() + .copy_to_file(Path::new(&split_filename), &dest_filepath) + .await?; + tantivy_dirs.push(get_tantivy_directory_from_split_bundle(&dest_filepath).unwrap()) + } + let merge_operation = MergeOperation::new_merge_operation(split_metas); + let merge_task = MergeTask::from_merge_operation_for_test(merge_operation); + let merge_scratch = MergeScratch { + merge_task, + tantivy_dirs, + merge_scratch_directory, + downloaded_splits_directory, + }; + let pipeline_id = MergePipelineId { + node_id: test_sandbox.node_id(), + index_uid: index_uid.clone(), + source_id: test_sandbox.source_id(), + }; + let (merge_packager_mailbox, merge_packager_inbox) = + test_sandbox.universe().create_test_mailbox(); + let merge_executor = MergeExecutor::new( + pipeline_id, + test_sandbox.metastore(), + test_sandbox.doc_mapper(), + IoControls::default(), + merge_packager_mailbox, + ); + let (merge_executor_mailbox, merge_executor_handle) = test_sandbox + .universe() + .spawn_builder() + .spawn(merge_executor); + merge_executor_mailbox.send_message(merge_scratch).await?; + merge_executor_handle.process_pending_and_observe().await; + + let packager_msgs: Vec = merge_packager_inbox.drain_for_test_typed(); + assert_eq!(packager_msgs.len(), 1); + let 
split_attrs_after_merge = &packager_msgs[0].splits[0].split_attrs; + // One document was soft-deleted, so only 3 docs should remain. + assert_eq!(split_attrs_after_merge.num_docs, 3); + assert_eq!(split_attrs_after_merge.uncompressed_docs_size_in_bytes, 102); + assert_eq!(split_attrs_after_merge.num_merge_ops, 1); + + let reader = packager_msgs[0].splits[0] + .index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into()?; + let searcher = reader.searcher(); + assert_eq!(searcher.segment_readers().len(), 1); + // The merged segment must contain exactly 3 live documents. + let num_live_docs: u32 = searcher + .segment_readers() + .iter() + .map(|r| r.num_docs()) + .sum(); + assert_eq!(num_live_docs, 3); + + test_sandbox.assert_quit().await; + Ok(()) + } + + /// Verifies that when a soft-delete lands on an input split while the + /// merge is running, the merge still succeeds. + #[tokio::test] + async fn test_merge_executor_soft_delete_race_condition() -> anyhow::Result<()> { + let doc_mapping_yaml = r#" + field_mappings: + - name: body + type: text + - name: ts + type: datetime + input_formats: + - unix_timestamp + fast: true + timestamp_field: ts + "#; + let test_sandbox = TestSandbox::create( + "test-index-soft-delete-race", + doc_mapping_yaml, + "", + &["body"], + ) + .await?; + for split_id in 0..4 { + let single_doc = std::iter::once( + serde_json::json!({"body": format!("split{split_id}"), "ts": 1631072713u64 + split_id}), + ); + test_sandbox.add_documents(single_doc).await?; + } + let metastore = test_sandbox.metastore(); + let index_uid = test_sandbox.index_uid(); + + // Read split metadata *before* the soft-delete — this is the stale snapshot that the + // merge task will carry, simulating a race where the delete arrives after the merge + // executor already read the metadata. 
+ let stale_split_metas: Vec = metastore + .list_splits(ListSplitsRequest::try_from_index_uid(index_uid.clone()).unwrap()) + .await + .unwrap() + .collect_splits_metadata() + .await + .unwrap(); + assert_eq!(stale_split_metas.len(), 4); + + // Soft-delete doc_id=0 from the first split *after* the stale metadata was read. + // This simulates a concurrent user action that arrives while the merge is running. + let racing_split_id = stale_split_metas[0].split_id.clone(); + metastore + .soft_delete_documents(quickwit_proto::metastore::SoftDeleteDocumentsRequest { + index_uid: Some(index_uid.clone()), + split_doc_ids: vec![quickwit_proto::metastore::SplitDocIds { + split_id: racing_split_id.clone(), + doc_ids: vec![0], + }], + }) + .await?; + + // Build the merge scratch using the stale metadata (no soft-deletes recorded). + let merge_scratch_directory = TempDirectory::for_test(); + let downloaded_splits_directory = + merge_scratch_directory.named_temp_child("downloaded-splits-")?; + let mut tantivy_dirs: Vec> = Vec::new(); + for split_meta in &stale_split_metas { + let split_filename = split_file(split_meta.split_id()); + let dest_filepath = downloaded_splits_directory.path().join(&split_filename); + test_sandbox + .storage() + .copy_to_file(Path::new(&split_filename), &dest_filepath) + .await?; + tantivy_dirs.push(get_tantivy_directory_from_split_bundle(&dest_filepath).unwrap()); + } + let merge_operation = MergeOperation::new_merge_operation(stale_split_metas); + let merge_task = MergeTask::from_merge_operation_for_test(merge_operation); + let merge_scratch = MergeScratch { + merge_task, + tantivy_dirs, + merge_scratch_directory, + downloaded_splits_directory, + }; + let pipeline_id = MergePipelineId { + node_id: test_sandbox.node_id(), + index_uid: index_uid.clone(), + source_id: test_sandbox.source_id(), + }; + let (merge_packager_mailbox, merge_packager_inbox) = + test_sandbox.universe().create_test_mailbox(); + let merge_executor = MergeExecutor::new( + 
pipeline_id, + test_sandbox.metastore(), + test_sandbox.doc_mapper(), + IoControls::default(), + merge_packager_mailbox, + ); + let (merge_executor_mailbox, merge_executor_handle) = test_sandbox + .universe() + .spawn_builder() + .spawn(merge_executor); + merge_executor_mailbox.send_message(merge_scratch).await?; + merge_executor_handle.process_pending_and_observe().await; + + // The merge must succeed despite the race condition. + let packager_msgs: Vec = merge_packager_inbox.drain_for_test_typed(); + assert_eq!( + packager_msgs.len(), + 1, + "merge must produce exactly one split batch" + ); + + let split_attrs = &packager_msgs[0].splits[0].split_attrs; + // The stale metadata had no soft-deletes, so all 4 docs are present in the merged + // segment. The racing soft-delete was missed. + assert_eq!(split_attrs.num_docs, 4); + assert_eq!(split_attrs.num_merge_ops, 1); + + // The snapshot carried in the batch reflects the stale state (no soft-deletes). + let replaced_splits = &packager_msgs[0].splits[0].split_attrs.replaced_splits; + assert_eq!( + replaced_splits.len(), + 4, + "all 4 input splits must appear in the snapshot" + ); + let racing_split_snapshot = replaced_splits + .iter() + .find(|replaced_split| replaced_split.split_id == racing_split_id) + .expect("racing split must be present in the snapshot"); + assert!( + racing_split_snapshot.soft_deleted_doc_ids.is_empty(), + "racing split had no soft-deletes at merge start (stale read)" + ); + + let reader = packager_msgs[0].splits[0] + .index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into()?; + let searcher = reader.searcher(); + assert_eq!(searcher.segment_readers().len(), 1); + let num_live_docs: u32 = searcher + .segment_readers() + .iter() + .map(|r| r.num_docs()) + .sum(); + // All 4 docs are physically present; the racing soft-delete was not applied. 
+ assert_eq!(num_live_docs, 4); + + test_sandbox.assert_quit().await; + Ok(()) + } + #[test] fn test_combine_partition_ids_singleton_unchanged() { assert_eq!(combine_partition_ids_aux([17]), 17); @@ -950,4 +1340,204 @@ mod tests { ) .await } + + #[tokio::test] + async fn test_delete_and_merge_with_soft_deleted_docs() -> anyhow::Result<()> { + quickwit_common::setup_logging_for_tests(); + let doc_mapping_yaml = r#" + field_mappings: + - name: body + type: text + - name: ts + type: datetime + input_formats: + - unix_timestamp + fast: true + timestamp_field: ts + "#; + let test_sandbox = TestSandbox::create( + "test-delete-and-merge-with-soft-delete", + doc_mapping_yaml, + "", + &["body"], + ) + .await?; + + // Three docs are ingested into a single split. + // doc_id=0 body="soft_delete" → removed by soft-delete + // doc_id=1 body="query_delete" → removed by the delete query + // doc_id=2 body="keep" → must survive both conditions + test_sandbox + .add_documents(vec![ + serde_json::json!({"body": "soft_delete", "ts": 1624928200}), + serde_json::json!({"body": "query_delete", "ts": 1624928201}), + serde_json::json!({"body": "keep", "ts": 1624928202}), + ]) + .await?; + + let metastore = test_sandbox.metastore(); + let index_uid = test_sandbox.index_uid(); + + let splits = metastore + .list_splits(ListSplitsRequest::try_from_index_uid(index_uid.clone()).unwrap()) + .await + .unwrap() + .collect_splits() + .await + .unwrap(); + assert_eq!(splits.len(), 1); + let original_split_id = splits[0].split_metadata.split_id.clone(); + + // Soft-delete doc_id=0 (the "soft_delete" document). + metastore + .soft_delete_documents(quickwit_proto::metastore::SoftDeleteDocumentsRequest { + index_uid: Some(index_uid.clone()), + split_doc_ids: vec![quickwit_proto::metastore::SplitDocIds { + split_id: original_split_id.clone(), + doc_ids: vec![0], + }], + }) + .await?; + + // Register a delete task targeting the "query_delete" document. 
+ metastore + .create_delete_task(DeleteQuery { + index_uid: Some(index_uid.clone()), + start_timestamp: None, + end_timestamp: None, + query_ast: quickwit_query::query_ast::qast_json_helper( + "body:query_delete", + &["body"], + ), + }) + .await?; + + // Reload split metadata so that soft_deleted_doc_ids is populated. + let splits = metastore + .list_splits(ListSplitsRequest::try_from_index_uid(index_uid.clone()).unwrap()) + .await + .unwrap() + .collect_splits() + .await + .unwrap(); + assert_eq!(splits.len(), 1); + assert_eq!( + splits[0].split_metadata.soft_deleted_doc_ids.len(), + 1, + "doc_id=0 must be recorded as soft-deleted before staging" + ); + + // Stage a replacement split with num_merge_ops=1. By cloning the freshly-read + // metadata the soft_deleted_doc_ids field is carried over into the merge task, + // which is exactly what process_delete_and_merge relies on. + let mut new_split_metadata = splits[0].split_metadata.clone(); + new_split_metadata.split_id = new_split_id(); + new_split_metadata.num_merge_ops = 1; + let stage_splits_request = + StageSplitsRequest::try_from_split_metadata(index_uid.clone(), &new_split_metadata) + .unwrap(); + metastore.stage_splits(stage_splits_request).await.unwrap(); + let publish_splits_request = PublishSplitsRequest { + index_uid: Some(index_uid.clone()), + staged_split_ids: vec![new_split_metadata.split_id.to_string()], + replaced_split_ids: vec![original_split_id.clone()], + index_checkpoint_delta_json_opt: None, + publish_token_opt: None, + }; + metastore + .publish_splits(publish_splits_request) + .await + .unwrap(); + + // Copy the original split bundle to the new split filename so the executor can open it. 
+ let merge_scratch_directory = TempDirectory::for_test(); + let downloaded_splits_directory = + merge_scratch_directory.named_temp_child("downloaded-splits-")?; + let split_filename = split_file(&original_split_id); + let new_split_filename = split_file(new_split_metadata.split_id()); + let dest_filepath = downloaded_splits_directory.path().join(&new_split_filename); + test_sandbox + .storage() + .copy_to_file(Path::new(&split_filename), &dest_filepath) + .await?; + let tantivy_dir = get_tantivy_directory_from_split_bundle(&dest_filepath).unwrap(); + let merge_operation = MergeOperation::new_delete_and_merge_operation(new_split_metadata); + let merge_task = MergeTask::from_merge_operation_for_test(merge_operation); + let merge_scratch = MergeScratch { + merge_task, + tantivy_dirs: vec![tantivy_dir], + merge_scratch_directory, + downloaded_splits_directory, + }; + let pipeline_id = MergePipelineId { + node_id: test_sandbox.node_id(), + index_uid: test_sandbox.index_uid(), + source_id: test_sandbox.source_id(), + }; + let universe = Universe::with_accelerated_time(); + let (merge_packager_mailbox, merge_packager_inbox) = universe.create_test_mailbox(); + let merge_executor = MergeExecutor::new( + pipeline_id, + metastore, + test_sandbox.doc_mapper(), + IoControls::default(), + merge_packager_mailbox, + ); + let (merge_executor_mailbox, merge_executor_handle) = + universe.spawn_builder().spawn(merge_executor); + merge_executor_mailbox.send_message(merge_scratch).await?; + merge_executor_handle.process_pending_and_observe().await; + + let packager_msgs: Vec = merge_packager_inbox.drain_for_test_typed(); + assert_eq!(packager_msgs.len(), 1); + let split = &packager_msgs[0].splits[0]; + + // 3 docs − 1 soft-deleted − 1 query-deleted = 1 surviving document. + assert_eq!(split.split_attrs.num_docs, 1); + assert_eq!(split.split_attrs.delete_opstamp, 1); + // Delete operations must not increment num_merge_ops. 
+ assert_eq!(split.split_attrs.num_merge_ops, 1); + + let reader = split + .index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into()?; + let searcher = reader.searcher(); + assert_eq!(searcher.segment_readers().len(), 1); + + let num_live_docs: u32 = searcher + .segment_readers() + .iter() + .map(|r| r.num_docs()) + .sum(); + assert_eq!( + num_live_docs, 1, + "exactly one document must remain after all deletions" + ); + + // The surviving document must be the "keep" one. + let documents_left: Vec = searcher + .search( + &tantivy::query::AllQuery, + &tantivy::collector::TopDocs::with_limit(10).order_by_score(), + )? + .into_iter() + .map(|(_, doc_address)| { + let doc: TantivyDocument = searcher.doc(doc_address).unwrap(); + let doc_json = doc.to_json(searcher.schema()); + serde_json::from_str(&doc_json).unwrap() + }) + .collect(); + let expected_doc = serde_json::json!({"body": ["keep"], "ts": ["2021-06-29T00:56:42Z"]}); + assert_eq!( + documents_left, + vec![expected_doc], + "only the 'keep' document must survive both soft-delete and query-delete" + ); + + test_sandbox.assert_quit().await; + universe.assert_quit().await; + Ok(()) + } } diff --git a/quickwit/quickwit-indexing/src/actors/merge_scheduler_service.rs b/quickwit/quickwit-indexing/src/actors/merge_scheduler_service.rs index bbe5267d514..3818edd8c73 100644 --- a/quickwit/quickwit-indexing/src/actors/merge_scheduler_service.rs +++ b/quickwit/quickwit-indexing/src/actors/merge_scheduler_service.rs @@ -33,6 +33,15 @@ pub struct MergePermit { } impl MergePermit { + /// Creates a `MergePermit` from an owned semaphore permit, without notifying any + /// `MergeSchedulerService` on drop. Use this when managing concurrency externally. 
+ pub fn new(permit: OwnedSemaphorePermit) -> MergePermit { + MergePermit { + _semaphore_permit: Some(permit), + merge_scheduler_mailbox: None, + } + } + #[cfg(any(test, feature = "testsuite"))] pub fn for_test() -> MergePermit { MergePermit { diff --git a/quickwit/quickwit-indexing/src/actors/merge_split_downloader.rs b/quickwit/quickwit-indexing/src/actors/merge_split_downloader.rs index 5d68bb59285..7d124288288 100644 --- a/quickwit/quickwit-indexing/src/actors/merge_split_downloader.rs +++ b/quickwit/quickwit-indexing/src/actors/merge_split_downloader.rs @@ -17,7 +17,7 @@ use std::path::Path; use async_trait::async_trait; use quickwit_actors::{Actor, ActorContext, ActorExitStatus, Handler, Mailbox, QueueCapacity}; use quickwit_common::io::IoControls; -use quickwit_common::temp_dir::{self, TempDirectory}; +use quickwit_common::temp_dir::TempDirectory; use quickwit_metastore::SplitMetadata; use tantivy::Directory; use tracing::{debug, info, instrument}; @@ -62,14 +62,13 @@ impl Handler for MergeSplitDownloader { merge_task: MergeTask, ctx: &ActorContext, ) -> Result<(), quickwit_actors::ActorExitStatus> { - let merge_scratch_directory = temp_dir::Builder::default() - .join("merge") - .tempdir_in(self.scratch_directory.path()) + let merge_scratch_directory = self + .scratch_directory + .named_temp_child("merge") .map_err(|error| anyhow::anyhow!(error))?; info!(dir=%merge_scratch_directory.path().display(), "download-merge-splits"); - let downloaded_splits_directory = temp_dir::Builder::default() - .join("downloaded-splits") - .tempdir_in(merge_scratch_directory.path()) + let downloaded_splits_directory = merge_scratch_directory + .named_temp_child("downloaded-splits") .map_err(|error| anyhow::anyhow!(error))?; let tantivy_dirs = self .download_splits( diff --git a/quickwit/quickwit-indexing/src/actors/packager.rs b/quickwit/quickwit-indexing/src/actors/packager.rs index 18e0bb40d73..ee43e050a5c 100644 --- a/quickwit/quickwit-indexing/src/actors/packager.rs +++ 
b/quickwit/quickwit-indexing/src/actors/packager.rs @@ -527,9 +527,9 @@ mod tests { uncompressed_docs_size_in_bytes: num_docs * 15, time_range: timerange_opt, secondary_time_range: None, - replaced_split_ids: Vec::new(), delete_opstamp: 0, num_merge_ops: 0, + replaced_splits: Vec::new(), }, index, split_scratch_directory, diff --git a/quickwit/quickwit-indexing/src/actors/publisher.rs b/quickwit/quickwit-indexing/src/actors/publisher.rs index b05081be706..2d85ca1a1af 100644 --- a/quickwit/quickwit-indexing/src/actors/publisher.rs +++ b/quickwit/quickwit-indexing/src/actors/publisher.rs @@ -12,22 +12,30 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::collections::{BTreeSet, HashMap}; + use anyhow::Context; use async_trait::async_trait; use fail::fail_point; use quickwit_actors::{Actor, ActorContext, Handler, Mailbox, QueueCapacity}; -use quickwit_proto::metastore::{MetastoreService, MetastoreServiceClient, PublishSplitsRequest}; +use quickwit_common::Progress; +use quickwit_metastore::{ListSplitsQuery, ListSplitsRequestExt, MetastoreServiceStreamSplitsExt}; +use quickwit_proto::metastore::{ + ListSplitsRequest, MetastoreService, MetastoreServiceClient, PublishSplitsRequest, +}; +use quickwit_proto::types::{IndexUid, SplitId}; use serde::Serialize; -use tracing::{info, instrument, warn}; +use tracing::{error, info, instrument, warn}; use crate::actors::MergePlanner; -use crate::models::{NewSplits, SplitsUpdate}; +use crate::models::{NewSplits, ReplacedSplit, SplitsUpdate}; use crate::source::{SourceActor, SuggestTruncate}; #[derive(Clone, Debug, Default, Serialize)] pub struct PublisherCounters { pub num_published_splits: u64, pub num_replace_operations: u64, + pub num_replaced_splits: u64, pub num_empty_splits: u64, } @@ -127,10 +135,10 @@ impl Handler for Publisher { let SplitsUpdate { index_uid, new_splits, - replaced_split_ids, checkpoint_delta_opt, publish_lock, publish_token_opt, + 
replaced_splits, .. } = split_update; @@ -143,11 +151,24 @@ impl Handler for Publisher { .iter() .map(|split| split.split_id.clone()) .collect(); + let replaced_split_ids = replaced_splits + .iter() + .map(|replaced| replaced.split_id.clone()) + .collect(); if let Some(_guard) = publish_lock.acquire().await { + if !replaced_splits.is_empty() { + warn_if_soft_deletes_changed_during_merge( + &index_uid, + &replaced_splits, + &self.metastore, + ctx.progress(), + ) + .await; + } let publish_splits_request = PublishSplitsRequest { index_uid: Some(index_uid), staged_split_ids: split_ids.clone(), - replaced_split_ids: replaced_split_ids.clone(), + replaced_split_ids, index_checkpoint_delta_json_opt, publish_token_opt: publish_token_opt.clone(), }; @@ -194,10 +215,11 @@ impl Handler for Publisher { .await; } - if replaced_split_ids.is_empty() { + if replaced_splits.is_empty() { self.counters.num_published_splits += 1; } else { self.counters.num_replace_operations += 1; + self.counters.num_replaced_splits += replaced_splits.len() as u64; } } else { self.counters.num_empty_splits += 1; @@ -207,6 +229,73 @@ impl Handler for Publisher { } } +/// Re-reads the soft-deleted doc IDs for all input splits from the metastore and logs an +/// error for each split whose soft-delete set grew while the merge was running. 
+async fn warn_if_soft_deletes_changed_during_merge( + index_uid: &IndexUid, + replaced_splits: &[ReplacedSplit], + metastore: &MetastoreServiceClient, + progress: &Progress, +) { + let query = ListSplitsQuery::for_index(index_uid.clone()).with_split_ids( + replaced_splits + .iter() + .map(|replaced| replaced.split_id.clone()) + .collect(), + ); + + let list_splits_request = match ListSplitsRequest::try_from_list_splits_query(&query) { + Ok(request) => request, + Err(err) => { + warn!(error = ?err, "failed to build list_splits request for soft-delete race detection"); + return; + } + }; + let splits_stream = match progress + .protect_future(metastore.list_splits(list_splits_request)) + .await + { + Ok(stream) => stream, + Err(err) => { + warn!(error = ?err, "failed to list splits for soft-delete race detection"); + return; + } + }; + let fresh_splits = match progress + .protect_future(splits_stream.collect_splits_metadata()) + .await + { + Ok(splits) => splits, + Err(err) => { + warn!(error = ?err, "failed to collect split metadata for soft-delete race detection"); + return; + } + }; + let snapshot: HashMap<&SplitId, &BTreeSet> = replaced_splits + .iter() + .map(|n| (&n.split_id, &n.soft_deleted_doc_ids)) + .collect(); + for fresh_split in &fresh_splits { + let Some(snapshot_ids) = snapshot.get(&fresh_split.split_id) else { + continue; + }; + let missed: BTreeSet = fresh_split + .soft_deleted_doc_ids + .difference(snapshot_ids) + .copied() + .collect(); + if !missed.is_empty() { + // TODO: this means that the merge didn't include some committed + // soft deletes. Those are lost. 
+ error!( + split_id = %fresh_split.split_id, + num_missed_soft_deletes = missed.len(), + "soft-delete race condition detected", + ); + } + } +} + #[cfg(test)] mod tests { use quickwit_actors::Universe; @@ -262,7 +351,6 @@ mod tests { split_id: "split".to_string(), ..Default::default() }], - replaced_split_ids: Vec::new(), checkpoint_delta_opt: Some(IndexCheckpointDelta { source_id: "source".to_string(), source_delta: SourceCheckpointDelta::from_range(1..3), @@ -271,6 +359,7 @@ mod tests { publish_token_opt: None, merge_task: None, parent_span: tracing::Span::none(), + replaced_splits: Vec::new(), }) .await .is_ok() @@ -278,6 +367,7 @@ mod tests { let publisher_observation = publisher_handle.process_pending_and_observe().await.state; assert_eq!(publisher_observation.num_published_splits, 1); + assert_eq!(publisher_observation.num_replaced_splits, 0); let suggest_truncate_checkpoints: Vec = source_inbox .drain_for_test_typed::() @@ -337,7 +427,6 @@ mod tests { .send_message(SplitsUpdate { index_uid: ref_index_uid.clone(), new_splits: Vec::new(), - replaced_split_ids: Vec::new(), checkpoint_delta_opt: Some(IndexCheckpointDelta { source_id: "source".to_string(), source_delta: SourceCheckpointDelta::from_range(1..3), @@ -346,6 +435,7 @@ mod tests { publish_token_opt: None, merge_task: None, parent_span: tracing::Span::none(), + replaced_splits: Vec::new(), }) .await .is_ok() @@ -354,6 +444,7 @@ mod tests { let publisher_observation = publisher_handle.process_pending_and_observe().await.state; assert_eq!(publisher_observation.num_published_splits, 0); assert_eq!(publisher_observation.num_replace_operations, 0); + assert_eq!(publisher_observation.num_replaced_splits, 0); assert_eq!(publisher_observation.num_empty_splits, 1); let suggest_truncate_checkpoints: Vec = source_inbox @@ -381,12 +472,21 @@ mod tests { let mut mock_metastore = MockMetastoreService::new(); let ref_index_uid: IndexUid = IndexUid::for_test("index", 1); let ref_index_uid_clone = 
ref_index_uid.clone(); + mock_metastore.expect_list_splits().times(1).returning(|_| { + use quickwit_common::ServiceStream; + use quickwit_metastore::ListSplitsResponseExt; + use quickwit_proto::metastore::ListSplitsResponse; + let response = ListSplitsResponse::try_from_splits(vec![]).unwrap(); + Ok(ServiceStream::from(vec![Ok(response)])) + }); mock_metastore .expect_publish_splits() .withf(move |publish_splits_requests| { + let mut replaced_split_ids = publish_splits_requests.replaced_split_ids.clone(); + replaced_split_ids.sort(); publish_splits_requests.index_uid() == &ref_index_uid_clone && publish_splits_requests.staged_split_ids[..] == ["split3"] - && publish_splits_requests.replaced_split_ids[..] == ["split1", "split2"] + && replaced_split_ids[..] == ["split1", "split2"] && publish_splits_requests .index_checkpoint_delta_json_opt() .is_empty() @@ -407,12 +507,21 @@ mod tests { split_id: "split3".to_string(), ..Default::default() }], - replaced_split_ids: vec!["split1".to_string(), "split2".to_string()], checkpoint_delta_opt: None, publish_lock: PublishLock::default(), publish_token_opt: None, merge_task: None, parent_span: Span::none(), + replaced_splits: vec![ + ReplacedSplit { + split_id: "split1".to_string(), + ..Default::default() + }, + ReplacedSplit { + split_id: "split2".to_string(), + ..Default::default() + }, + ], }; assert!( publisher_mailbox @@ -423,6 +532,7 @@ mod tests { let publisher_observation = publisher_handle.process_pending_and_observe().await.state; assert_eq!(publisher_observation.num_published_splits, 0); assert_eq!(publisher_observation.num_replace_operations, 1); + assert_eq!(publisher_observation.num_replaced_splits, 2); let merge_planner_msgs = merge_planner_inbox.drain_for_test_typed::(); assert_eq!(merge_planner_msgs.len(), 1); assert_eq!(merge_planner_msgs[0].new_splits.len(), 1); @@ -451,21 +561,99 @@ mod tests { .send_message(SplitsUpdate { index_uid: IndexUid::new_with_random_ulid("index"), new_splits: 
vec![SplitMetadata::for_test("test-split".to_string())], - replaced_split_ids: Vec::new(), checkpoint_delta_opt: None, publish_lock, publish_token_opt: None, merge_task: None, parent_span: Span::none(), + replaced_splits: Vec::new(), }) .await .unwrap(); let publisher_observation = publisher_handle.process_pending_and_observe().await.state; assert_eq!(publisher_observation.num_published_splits, 0); + assert_eq!(publisher_observation.num_replaced_splits, 0); let merger_messages = merge_planner_inbox.drain_for_test(); assert!(merger_messages.is_empty()); universe.assert_quit().await; } + + #[tokio::test] + async fn test_publisher_warns_on_soft_delete_race_condition() { + use std::collections::BTreeSet; + + use quickwit_common::ServiceStream; + use quickwit_metastore::{ListSplitsResponseExt, Split, SplitState}; + use quickwit_proto::metastore::ListSplitsResponse; + + let universe = Universe::with_accelerated_time(); + let ref_index_uid: IndexUid = IndexUid::for_test("index", 1); + let racing_split_id = "racing-split".to_string(); + + let mut mock_metastore = MockMetastoreService::new(); + + // list_splits returns the racing split with a new soft-delete absent from the snapshot. 
+ let racing_split_id_clone = racing_split_id.clone(); + mock_metastore + .expect_list_splits() + .times(1) + .returning(move |_| { + let split = Split { + split_metadata: SplitMetadata { + split_id: racing_split_id_clone.clone(), + soft_deleted_doc_ids: BTreeSet::from([0u32]), + ..Default::default() + }, + split_state: SplitState::Published, + update_timestamp: 0, + publish_timestamp: None, + }; + let response = ListSplitsResponse::try_from_splits(vec![split]).unwrap(); + Ok(ServiceStream::from(vec![Ok(response)])) + }); + + mock_metastore + .expect_publish_splits() + .times(1) + .returning(|_| Ok(EmptyResponse {})); + + let publisher = Publisher::new( + PublisherType::MergePublisher, + MetastoreServiceClient::from_mock(mock_metastore), + None, + None, + ); + let (publisher_mailbox, publisher_handle) = universe.spawn_builder().spawn(publisher); + + // Snapshot shows the racing split had no soft-deletes at merge start (stale read). + let replaced_splits = vec![ReplacedSplit { + split_id: racing_split_id.clone(), + ..Default::default() + }]; + + publisher_mailbox + .send_message(SplitsUpdate { + index_uid: ref_index_uid.clone(), + new_splits: vec![SplitMetadata { + split_id: "merged-split".to_string(), + ..Default::default() + }], + checkpoint_delta_opt: None, + publish_lock: PublishLock::default(), + publish_token_opt: None, + merge_task: None, + parent_span: Span::none(), + replaced_splits, + }) + .await + .unwrap(); + + // Publish must still succeed despite the race condition (warning is non-fatal). 
+ let observation = publisher_handle.process_pending_and_observe().await.state; + assert_eq!(observation.num_replace_operations, 1); + assert_eq!(observation.num_replaced_splits, 1); + universe.assert_quit().await; + } } diff --git a/quickwit/quickwit-indexing/src/actors/uploader.rs b/quickwit/quickwit-indexing/src/actors/uploader.rs index 2a012858587..1d9e71d87ba 100644 --- a/quickwit/quickwit-indexing/src/actors/uploader.rs +++ b/quickwit/quickwit-indexing/src/actors/uploader.rs @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::collections::HashSet; -use std::iter::FromIterator; use std::mem; use std::sync::Arc; use std::sync::atomic::{AtomicU64, Ordering}; @@ -27,11 +25,9 @@ use quickwit_actors::{Actor, ActorContext, ActorExitStatus, Handler, Mailbox, Qu use quickwit_common::pubsub::EventBroker; use quickwit_common::spawn_named_task; use quickwit_config::RetentionPolicy; -use quickwit_metastore::checkpoint::IndexCheckpointDelta; use quickwit_metastore::{SplitMetadata, StageSplitsRequestExt}; use quickwit_proto::metastore::{MetastoreService, MetastoreServiceClient, StageSplitsRequest}; use quickwit_proto::search::{ReportSplit, ReportSplitsRequest}; -use quickwit_proto::types::{IndexUid, PublishToken}; use quickwit_storage::SplitPayloadBuilder; use serde::Serialize; use tokio::sync::oneshot::Sender; @@ -40,10 +36,10 @@ use tracing::{Instrument, Span, debug, info, instrument, warn}; use crate::actors::Publisher; use crate::actors::sequencer::{Sequencer, SequencerCommand}; -use crate::merge_policy::{MergePolicy, MergeTask}; +use crate::merge_policy::MergePolicy; use crate::metrics::INDEXER_METRICS; use crate::models::{ - EmptySplit, PackagedSplit, PackagedSplitBatch, PublishLock, SplitsUpdate, create_split_metadata, + EmptySplit, PackagedSplit, PackagedSplitBatch, SplitsUpdate, create_split_metadata, }; use crate::split_store::IndexingSplitStore; @@ -370,6 +366,7 @@ impl Handler for 
Uploader { event_broker.publish(ReportSplitsRequest { report_splits }); + let mut replaced_splits = Vec::new(); for (packaged_split, metadata) in batch.splits.into_iter().zip(split_metadata_list) { let upload_result = upload_split( &packaged_split, @@ -385,18 +382,24 @@ impl Handler for Uploader { return; } + replaced_splits.extend(packaged_split.split_attrs.replaced_splits.iter().cloned()); packaged_splits_and_metadata.push((packaged_split, metadata)); } - let splits_update = make_publish_operation( + assert!(!packaged_splits_and_metadata.is_empty()); + let splits_update = SplitsUpdate { index_uid, - packaged_splits_and_metadata, - batch.checkpoint_delta_opt, - batch.publish_lock, - batch.publish_token_opt, - batch.merge_task_opt, - batch.batch_parent_span, - ); + new_splits: packaged_splits_and_metadata + .into_iter() + .map(|split_and_meta| split_and_meta.1) + .collect_vec(), + checkpoint_delta_opt: batch.checkpoint_delta_opt, + publish_lock: batch.publish_lock, + publish_token_opt: batch.publish_token_opt, + merge_task: batch.merge_task_opt, + parent_span: batch.batch_parent_span, + replaced_splits, + }; let target = match &split_update_sender { SplitsUpdateSender::Sequencer(_) => "sequencer", @@ -439,12 +442,12 @@ impl Handler for Uploader { let splits_update = SplitsUpdate { index_uid: empty_split.index_uid, new_splits: Vec::new(), - replaced_split_ids: Vec::new(), checkpoint_delta_opt: Some(empty_split.checkpoint_delta), publish_lock: empty_split.publish_lock, publish_token_opt: empty_split.publish_token_opt, merge_task: None, parent_span: empty_split.batch_parent_span, + replaced_splits: Vec::new(), }; split_update_sender.send(splits_update, ctx).await?; @@ -452,35 +455,6 @@ impl Handler for Uploader { } } -fn make_publish_operation( - index_uid: IndexUid, - packaged_splits_and_metadatas: Vec<(PackagedSplit, SplitMetadata)>, - checkpoint_delta_opt: Option, - publish_lock: PublishLock, - publish_token_opt: Option, - merge_task: Option, - parent_span: Span, 
-) -> SplitsUpdate { - assert!(!packaged_splits_and_metadatas.is_empty()); - let replaced_split_ids = packaged_splits_and_metadatas - .iter() - .flat_map(|(split, _)| split.split_attrs.replaced_split_ids.clone()) - .collect::>(); - SplitsUpdate { - index_uid, - new_splits: packaged_splits_and_metadatas - .into_iter() - .map(|split_and_meta| split_and_meta.1) - .collect_vec(), - replaced_split_ids: Vec::from_iter(replaced_split_ids), - checkpoint_delta_opt, - publish_lock, - publish_token_opt, - merge_task, - parent_span, - } -} - #[instrument( level = "info" name = "upload", @@ -512,6 +486,7 @@ async fn upload_split( #[cfg(test)] mod tests { + use std::collections::BTreeSet; use std::path::PathBuf; use std::time::Duration; @@ -520,14 +495,14 @@ mod tests { use quickwit_common::temp_dir::TempDirectory; use quickwit_metastore::checkpoint::{IndexCheckpointDelta, SourceCheckpointDelta}; use quickwit_proto::metastore::{EmptyResponse, MockMetastoreService}; - use quickwit_proto::types::{DocMappingUid, NodeId}; + use quickwit_proto::types::{DocMappingUid, IndexUid, NodeId}; use quickwit_storage::RamStorage; use tantivy::DateTime; use tokio::sync::oneshot; use super::*; use crate::merge_policy::{NopMergePolicy, default_merge_policy}; - use crate::models::{SplitAttrs, SplitsUpdate}; + use crate::models::{PublishLock, ReplacedSplit, SplitAttrs, SplitsUpdate}; #[tokio::test] async fn test_uploader_with_sequencer() -> anyhow::Result<()> { @@ -590,10 +565,10 @@ mod tests { secondary_time_range: None, uncompressed_docs_size_in_bytes: 1_000, num_docs: 10, - replaced_split_ids: Vec::new(), split_id: "test-split".to_string(), delete_opstamp: 10, num_merge_ops: 0, + replaced_splits: Vec::new(), }, serialized_split_fields: Vec::new(), split_scratch_directory, @@ -627,7 +602,6 @@ mod tests { index_uid, new_splits, checkpoint_delta_opt, - replaced_split_ids, .. 
} = publisher_message; @@ -640,7 +614,6 @@ mod tests { checkpoint_delta.source_delta, SourceCheckpointDelta::from_range(3..15) ); - assert!(replaced_split_ids.is_empty()); let mut files = ram_storage.list_files().await; files.sort(); assert_eq!(&files, &[PathBuf::from("test-split.split")]); @@ -703,12 +676,12 @@ mod tests { ..=DateTime::from_timestamp_secs(1_628_203_640), ), secondary_time_range: None, - replaced_split_ids: vec![ - "replaced-split-1".to_string(), - "replaced-split-2".to_string(), - ], delete_opstamp: 0, num_merge_ops: 0, + replaced_splits: Vec::from([ReplacedSplit { + split_id: "replaced-split-1".to_string(), + soft_deleted_doc_ids: BTreeSet::new(), + }]), }, serialized_split_fields: Vec::new(), split_scratch_directory: split_scratch_directory_1, @@ -731,12 +704,12 @@ mod tests { ..=DateTime::from_timestamp_secs(1_628_203_640), ), secondary_time_range: None, - replaced_split_ids: vec![ - "replaced-split-1".to_string(), - "replaced-split-2".to_string(), - ], delete_opstamp: 0, num_merge_ops: 0, + replaced_splits: Vec::from([ReplacedSplit { + split_id: "replaced-split-2".to_string(), + soft_deleted_doc_ids: BTreeSet::new(), + }]), }, serialized_split_fields: Vec::new(), split_scratch_directory: split_scratch_directory_2, @@ -772,21 +745,26 @@ mod tests { let SplitsUpdate { index_uid, new_splits, - mut replaced_split_ids, checkpoint_delta_opt, + replaced_splits, .. } = publisher_message; assert_eq!(index_uid.index_id, "test-index"); // Sort first to avoid test failing. 
- replaced_split_ids.sort(); assert_eq!(new_splits.len(), 2); assert_eq!(new_splits[0].split_id(), "test-split-1"); assert_eq!(new_splits[1].split_id(), "test-split-2"); assert_eq!( - &replaced_split_ids, - &[ - "replaced-split-1".to_string(), - "replaced-split-2".to_string() + &replaced_splits, + &vec![ + ReplacedSplit { + split_id: "replaced-split-1".to_string(), + soft_deleted_doc_ids: BTreeSet::new(), + }, + ReplacedSplit { + split_id: "replaced-split-2".to_string(), + soft_deleted_doc_ids: BTreeSet::new(), + }, ] ); assert!(checkpoint_delta_opt.is_none()); @@ -855,9 +833,9 @@ mod tests { secondary_time_range: None, uncompressed_docs_size_in_bytes: 1_000, num_docs: 10, - replaced_split_ids: Vec::new(), delete_opstamp: 10, num_merge_ops: 0, + replaced_splits: Vec::new(), }, serialized_split_fields: Vec::new(), split_scratch_directory, @@ -879,13 +857,13 @@ mod tests { let SplitsUpdate { index_uid, new_splits, - replaced_split_ids, + replaced_splits, .. } = publisher_inbox.recv_typed_message().await.unwrap(); assert_eq!(index_uid.index_id, "test-index"); assert_eq!(new_splits.len(), 1); - assert!(replaced_split_ids.is_empty()); + assert!(replaced_splits.is_empty()); universe.assert_quit().await; Ok(()) } @@ -943,7 +921,7 @@ mod tests { index_uid, new_splits, checkpoint_delta_opt, - replaced_split_ids, + replaced_splits, .. 
} = publisher_message; @@ -955,7 +933,7 @@ mod tests { checkpoint_delta.source_delta, SourceCheckpointDelta::from_range(3..15) ); - assert!(replaced_split_ids.is_empty()); + assert!(replaced_splits.is_empty()); let files = ram_storage.list_files().await; assert!(files.is_empty()); universe.assert_quit().await; @@ -1037,10 +1015,10 @@ mod tests { secondary_time_range: None, uncompressed_docs_size_in_bytes: 1_000, num_docs: 10, - replaced_split_ids: Vec::new(), split_id: SPLIT_ULID_STR.to_string(), delete_opstamp: 10, num_merge_ops: 0, + replaced_splits: Vec::new(), }, serialized_split_fields: Vec::new(), split_scratch_directory, diff --git a/quickwit/quickwit-indexing/src/controlled_directory.rs b/quickwit/quickwit-indexing/src/controlled_directory.rs index b209b4888d6..86e4a5fce0e 100644 --- a/quickwit/quickwit-indexing/src/controlled_directory.rs +++ b/quickwit/quickwit-indexing/src/controlled_directory.rs @@ -93,7 +93,7 @@ impl Directory for ControlledDirectory { self.check_if_alive() .map_err(|io_err| OpenWriteError::wrap_io_error(io_err, path.to_path_buf()))?; - let underlying_wrt: Box = self + let underlying_wrt: Box = self .underlying .open_write(path)? .into_inner() @@ -154,7 +154,9 @@ impl IoControlsAccess for HotswappableIoControls { } // Wrapper to work around the orphan rule. (hence the word "Adopted"). 
-struct AdoptedControlledWrite(ControlledWrite>); +struct AdoptedControlledWrite( + ControlledWrite>, +); impl io::Write for AdoptedControlledWrite { fn write(&mut self, buf: &[u8]) -> io::Result { diff --git a/quickwit/quickwit-indexing/src/lib.rs b/quickwit/quickwit-indexing/src/lib.rs index 2c2b28a09d7..9183fda3890 100644 --- a/quickwit/quickwit-indexing/src/lib.rs +++ b/quickwit/quickwit-indexing/src/lib.rs @@ -35,9 +35,12 @@ pub use crate::split_store::{IndexingSplitStore, get_tantivy_directory_from_spli pub mod actors; mod controlled_directory; +pub mod mature_merge; +pub mod mature_merge_plan; pub mod merge_policy; mod metrics; pub mod models; +mod soft_delete_query; pub mod source; mod split_store; #[cfg(any(test, feature = "testsuite"))] diff --git a/quickwit/quickwit-indexing/src/mature_merge.rs b/quickwit/quickwit-indexing/src/mature_merge.rs new file mode 100644 index 00000000000..c7486e2e561 --- /dev/null +++ b/quickwit/quickwit-indexing/src/mature_merge.rs @@ -0,0 +1,858 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use anyhow::{Context, bail}; +use bytesize::ByteSize; +use futures::StreamExt; +use quickwit_actors::{ActorExitStatus, Universe}; +use quickwit_common::io::IoControls; +use quickwit_common::{KillSwitch, temp_dir}; +use quickwit_metastore::{ + IndexMetadata, ListIndexesMetadataResponseExt, ListSplitsQuery, ListSplitsRequestExt, + MetastoreServiceStreamSplitsExt, SplitState, +}; +use quickwit_proto::indexing::MergePipelineId; +use quickwit_proto::metastore::{ + ListIndexesMetadataRequest, ListSplitsRequest, MetastoreService, MetastoreServiceClient, +}; +use quickwit_proto::types::NodeId; +use quickwit_storage::StorageResolver; +use tantivy::Inventory; +use time::OffsetDateTime; +use tokio::sync::Semaphore; +use tracing::{info, warn}; + +use crate::actors::{ + MergeExecutor, MergePermit, MergeSplitDownloader, Packager, Publisher, PublisherType, Uploader, + UploaderType, +}; +use crate::mature_merge_plan::{MATURITY_BUFFER, plan_merge_operations_for_index}; +use crate::merge_policy::{MergeOperation, MergeTask, NopMergePolicy}; +use crate::split_store::{IndexingSplitCache, IndexingSplitStore}; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct MatureMergeConfig { + /// Splits within this many days of the retention cutoff are left untouched. + pub retention_safety_buffer_days: u64, + /// Minimum number of splits in a group before a merge operation is emitted. + pub min_merge_group_size: usize, + /// Maximum number of docs in a split for it to be eligible for mature merging. + pub input_split_max_num_docs: usize, + /// Maximum number of splits per merge operation. + pub max_merge_group_size: usize, + /// Maximum total number of documents per merge operation. + pub split_target_num_docs: usize, + /// Focus on splits that span this many days. + pub split_timestamp_days_range: u8, + /// Number of indexes processed concurrently. Lower to avoid fetching splits + /// metadata too eagerly. 
+ pub index_parallelism: usize, + /// Maximum number of merges running concurrently across all indexes. + pub max_concurrent_merges: usize, + /// Print planned operations without executing them. + pub dry_run: bool, + /// List of index patterns to include in the mature merge process. + pub index_id_patterns: Vec, +} + +impl Default for MatureMergeConfig { + fn default() -> Self { + Self { + retention_safety_buffer_days: 5, + min_merge_group_size: 5, + input_split_max_num_docs: 10_000, + max_merge_group_size: 100, + split_target_num_docs: 5_000_000, + split_timestamp_days_range: 0, // by default single day splits + index_parallelism: 50, + max_concurrent_merges: 10, + dry_run: false, + index_id_patterns: vec!["*".to_string()], + } + } +} + +/// Statistics for the merges performed on a single index. +#[derive(Debug, Default)] +struct IndexMergeOutcome { + num_published_merges: u64, + num_replaced_splits: u64, +} + +struct IndexMergeSummary { + num_merges_planned: usize, + num_input_splits: usize, + total_input_bytes: u64, + outcome: IndexMergeOutcome, +} + +/// Fetches all published splits for the given index from the metastore (no +/// node-id filter) and calls [`plan_merge_operations_for_index`]. 
+async fn fetch_splits_and_plan( + index_metadata: &IndexMetadata, + metastore: &MetastoreServiceClient, + now: OffsetDateTime, + config: &MatureMergeConfig, +) -> anyhow::Result> { + let index_uid = index_metadata.index_uid.clone(); + let list_splits_query = ListSplitsQuery::for_index(index_uid) + .with_split_state(SplitState::Published) + .retain_mature(now - MATURITY_BUFFER); + let list_splits_request = ListSplitsRequest::try_from_list_splits_query(&list_splits_query)?; + let splits_stream = metastore.list_splits(list_splits_request).await?; + let splits = splits_stream.collect_splits_metadata().await?; + + if splits.iter().any(|s| !s.tags.is_empty()) { + // with tags and doc mapping evolutions, we might have weird edge cases + // -> just refuse them for now + bail!("tags not supported in mature merges") + } + + let total_splits = splits.len(); + let operations = + plan_merge_operations_for_index(&index_metadata.index_config, splits, now, config); + + info!( + index_id = %index_metadata.index_config.index_id, + total_splits, + num_planned_merges = operations.len(), + "fetched splits for mature merge planning" + ); + Ok(operations) +} + +/// Executes the given merge operations for a single index using the standard +/// actor pipeline: `MergeSplitDownloader -> MergeExecutor -> Packager -> +/// Uploader -> Publisher`. +/// +/// Tags are not supported and we use the default tokenizer manager. In practice +/// we could use the tags and custom tokenizers from the current doc mapping, +/// but schema evolutions could lead to un-anticipated edge cases. 
+#[allow(clippy::too_many_arguments)] +async fn run_mature_merges_for_index( + index_metadata: &IndexMetadata, + operations: Vec, + metastore: MetastoreServiceClient, + split_store: IndexingSplitStore, + semaphore: Arc, + data_dir_path: &std::path::Path, + config: &MatureMergeConfig, + node_id: NodeId, +) -> anyhow::Result { + if operations.is_empty() { + return Ok(IndexMergeOutcome { + num_published_merges: 0, + num_replaced_splits: 0, + }); + } + + let index_config = &index_metadata.index_config; + let index_uid = index_metadata.index_uid.clone(); + + let indexing_directory = temp_dir::Builder::default() + .join("mature-merge") + .tempdir_in(data_dir_path) + .context("failed to create temp directory for mature merge")?; + + let pipeline_id = MergePipelineId { + node_id, + index_uid, + source_id: "_mature_merge".to_string(), + }; + + let universe = Universe::new(); + let kill_switch = KillSwitch::default(); + + // Build chain from publisher inward (each actor gets the next actor's mailbox). + + let merge_publisher = Publisher::new( + PublisherType::MergePublisher, + metastore.clone(), + // No feedback loop to a merge planner. 
+ None, + None, + ); + let (merge_publisher_mailbox, merge_publisher_handle) = universe + .spawn_builder() + .set_kill_switch(kill_switch.clone()) + .spawn(merge_publisher); + + let merge_uploader = Uploader::new( + UploaderType::MergeUploader, + metastore.clone(), + Arc::new(NopMergePolicy), + index_config.retention_policy_opt.clone(), + split_store.clone(), + merge_publisher_mailbox.into(), + config.max_concurrent_merges, + Default::default(), + ); + let (merge_uploader_mailbox, merge_uploader_handle) = universe + .spawn_builder() + .set_kill_switch(kill_switch.clone()) + .spawn(merge_uploader); + + // Tag fields not supported for now + let tag_fields = Vec::new(); + let merge_packager = Packager::new("MaturePackager", tag_fields, merge_uploader_mailbox); + let (merge_packager_mailbox, merge_packager_handle) = universe + .spawn_builder() + .set_kill_switch(kill_switch.clone()) + .spawn(merge_packager); + + let merge_executor = MergeExecutor::new_with_tokenizers_only( + pipeline_id, + metastore, + // we only support the default tokenizer manager + quickwit_query::create_default_quickwit_tokenizer_manager(), + IoControls::default().set_component("mature_merger"), + merge_packager_mailbox, + ); + let (merge_executor_mailbox, merge_executor_handle) = universe + .spawn_builder() + .set_kill_switch(kill_switch.clone()) + .spawn(merge_executor); + + let merge_split_downloader = MergeSplitDownloader { + scratch_directory: indexing_directory, + split_store, + executor_mailbox: merge_executor_mailbox, + io_controls: IoControls::default().set_component("mature_split_downloader"), + }; + let (merge_split_downloader_mailbox, merge_split_downloader_handle) = universe + .spawn_builder() + .set_kill_switch(kill_switch.clone()) + .spawn(merge_split_downloader); + + // Send all merge tasks to the downloader, gated by the concurrency semaphore. 
+ let inventory: Inventory = Inventory::default(); + for operation in operations { + let permit = Arc::clone(&semaphore) + .acquire_owned() + .await + .expect("semaphore should not be closed"); + let merge_task = MergeTask { + merge_operation: inventory.track(operation), + _merge_permit: MergePermit::new(permit), + }; + if merge_split_downloader_mailbox + .send_message(merge_task) + .await + .is_err() + { + anyhow::bail!("merge split downloader actor died unexpectedly"); + } + } + + // Dropping the downloader mailbox signals no more tasks are coming. + // The pipeline will cascade-exit once all pending tasks are processed. + drop(merge_split_downloader_mailbox); + + let (downloader_status, _) = merge_split_downloader_handle.join().await; + let (executor_status, _) = merge_executor_handle.join().await; + let (packager_status, _) = merge_packager_handle.join().await; + let (uploader_status, _) = merge_uploader_handle.join().await; + let (publisher_status, publisher_counters) = merge_publisher_handle.join().await; + + universe.quit().await; + + for (name, status) in [ + ("downloader", downloader_status), + ("executor", executor_status), + ("packager", packager_status), + ("uploader", uploader_status), + ("publisher", publisher_status), + ] { + if !matches!(status, ActorExitStatus::Success | ActorExitStatus::Quit) { + anyhow::bail!( + "mature merge actor `{}` exited with unexpected status: {:?}", + name, + status + ); + } + } + + Ok(IndexMergeOutcome { + num_published_merges: publisher_counters.num_replace_operations, + num_replaced_splits: publisher_counters.num_replaced_splits, + }) +} + +/// Plans and optionally executes mature merges for a single index +#[allow(clippy::too_many_arguments)] +async fn merge_mature_single_index( + index_metadata: IndexMetadata, + metastore: &MetastoreServiceClient, + storage_resolver: &StorageResolver, + semaphore: Arc, + data_dir_path: &std::path::Path, + config: &MatureMergeConfig, + node_id: NodeId, + now: OffsetDateTime, +) -> 
anyhow::Result { + let index_id = index_metadata.index_config.index_id.clone(); + let operations = fetch_splits_and_plan(&index_metadata, metastore, now, config).await?; + let num_merges_planned = operations.len(); + let num_input_splits: usize = operations.iter().map(|op| op.splits.len()).sum(); + let total_input_bytes: u64 = operations + .iter() + .flat_map(|op| op.splits.iter()) + .map(|s| s.uncompressed_docs_size_in_bytes) + .sum(); + + if config.dry_run { + for op in &operations { + log_op_for_dry_run(op, &index_metadata.index_config.index_id); + } + return Ok(IndexMergeSummary { + num_merges_planned, + num_input_splits, + total_input_bytes, + outcome: IndexMergeOutcome::default(), + }); + } + + if operations.is_empty() { + return Ok(IndexMergeSummary { + num_merges_planned: 0, + total_input_bytes: 0, + num_input_splits: 0, + outcome: IndexMergeOutcome::default(), + }); + } + + let index_uri = index_metadata.index_uri(); + let remote_storage = storage_resolver + .resolve(index_uri) + .await + .context("failed to resolve index storage")?; + let split_store = + IndexingSplitStore::new(remote_storage, Arc::new(IndexingSplitCache::no_caching())); + + let outcome = run_mature_merges_for_index( + &index_metadata, + operations, + metastore.clone(), + split_store, + semaphore, + data_dir_path, + config, + node_id, + ) + .await?; + + if num_merges_planned > 0 { + info!( + index_id = %index_id, + planned = num_merges_planned, + published_merges = outcome.num_published_merges, + replaced_splits = outcome.num_replaced_splits, + input_splits = num_input_splits, + input_bytes = total_input_bytes, + "mature split merges complete for index" + ); + } + + Ok(IndexMergeSummary { + num_merges_planned, + num_input_splits, + total_input_bytes, + outcome, + }) +} + +/// Aggregates per-index results, logs per-index and global summary lines, and warns on errors. 
+fn log_merge_results(results: Vec>, dry_run: bool) { + let mut total_planned_merges = 0usize; + let mut total_input_splits = 0usize; + let mut total_input_bytes = 0u64; + let mut total_successfully_published_merges = 0u64; + let mut total_successfully_replaced_splits = 0u64; + + let mut num_indexes_successfully_merged = 0usize; + let mut num_indexes_partially_merged = 0usize; + let mut num_indexes_without_opportunity = 0usize; + + for result in results { + match result { + Ok(summary) => { + total_planned_merges += summary.num_merges_planned; + total_input_splits += summary.num_input_splits; + total_input_bytes += summary.total_input_bytes; + total_successfully_published_merges += summary.outcome.num_published_merges; + total_successfully_replaced_splits += summary.outcome.num_replaced_splits; + + if summary.num_merges_planned == 0 { + num_indexes_without_opportunity += 1; + } else if summary.outcome.num_published_merges + == (summary.num_merges_planned as u64) + { + num_indexes_successfully_merged += 1; + } else { + num_indexes_partially_merged += 1; + } + } + Err(err) => { + warn!(err = ?err, "error processing index during mature merge"); + } + } + } + if dry_run { + info!( + num_indexes_with_opportunities = num_indexes_partially_merged, + num_indexes_without_opportunity, + total_planned_merges, + total_input_splits, + total_input_bytes, + "mature merge dry-run complete" + ); + } else { + info!( + num_indexes_successfully_merged, + num_indexes_partially_merged, + num_indexes_without_opportunity, + total_planned_merges, + total_successfully_published_merges, + total_successfully_replaced_splits, + total_input_splits, + total_input_bytes, + "mature merge complete" + ); + } +} + +fn log_op_for_dry_run(op: &MergeOperation, index_id: &str) { + let start_time = op + .splits + .iter() + .filter_map(|s| s.time_range.as_ref().map(|r| r.start())) + .min() + .unwrap_or(&0); + let end_time = op + .splits + .iter() + .filter_map(|s| s.time_range.as_ref().map(|r| r.end())) + 
.max() + .unwrap_or(&0); + let fmt_ts = |ts: i64| { + OffsetDateTime::from_unix_timestamp(ts) + .map(|dt| { + format!( + "{}-{:02}-{:02}T{:02}", + dt.year(), + dt.month() as u8, + dt.day(), + dt.hour() + ) + }) + .unwrap_or_else(|_| ts.to_string()) + }; + // print is better than log because dry-run will be used interactively from the CLI + println!( + "[dry-run] {index_id}: {} splits | {} docs | {} | {} → {}", + op.splits.len(), + op.splits.iter().map(|s| s.num_docs).sum::(), + ByteSize(op.splits.iter().map(|s| s.footer_offsets.end).sum::()), + fmt_ts(*start_time), + fmt_ts(*end_time), + ); +} + +/// Processes all indexes from the metastore, discovering and running mature +/// merge opportunities. +/// +/// If `dry_run` is `true`, the planned operations are printed but not executed. +pub async fn merge_mature_all_indexes( + metastore: MetastoreServiceClient, + storage_resolver: StorageResolver, + data_dir_path: &std::path::Path, + config: MatureMergeConfig, + node_id: NodeId, +) -> anyhow::Result<()> { + let indexes_metadata = metastore + .list_indexes_metadata(ListIndexesMetadataRequest { + index_id_patterns: config.index_id_patterns.clone(), + }) + .await + .context("failed to list indexes")? 
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use quickwit_common::temp_dir::TempDirectory;
    use quickwit_config::ConfigFormat;
    use quickwit_metastore::{
        IndexMetadata, IndexMetadataResponseExt, SplitMaturity, SplitMetadata,
        UpdateIndexRequestExt,
    };
    use quickwit_proto::metastore::{
        IndexMetadataRequest, ListSplitsRequest, MetastoreService, MetastoreServiceClient,
        MockMetastoreService, UpdateIndexRequest,
    };
    use quickwit_proto::types::NodeId;
    use quickwit_storage::RamStorage;

    use super::*;
    use crate::TestSandbox;

    /// Tests the short-circuit path: when no merge operations are planned,
    /// `run_mature_merges_for_index` returns 0 immediately without spawning any actors.
    #[tokio::test]
    async fn test_run_mature_merges_for_index_no_operations() -> anyhow::Result<()> {
        // The mock has no expectations configured: any metastore call would fail the test.
        let mock_metastore = MockMetastoreService::new();
        let storage = Arc::new(RamStorage::default());
        let split_store = IndexingSplitStore::create_without_local_store_for_test(storage);
        let index_metadata = IndexMetadata::for_test("test-index", "ram:///test-index");
        let data_dir = TempDirectory::for_test();
        let node_id = NodeId::from("test-node");

        let semaphore = Arc::new(Semaphore::new(2));
        let outcome = run_mature_merges_for_index(
            &index_metadata,
            vec![],
            MetastoreServiceClient::from_mock(mock_metastore),
            split_store,
            semaphore,
            data_dir.path(),
            &MatureMergeConfig::default(),
            node_id,
        )
        .await?;

        assert_eq!(outcome.num_published_merges, 0);
        assert_eq!(outcome.num_replaced_splits, 0);
        Ok(())
    }

    /// Tests the full per index pipeline end-to-end with a single merge operation
    #[tokio::test]
    async fn test_run_mature_merges_for_index_merges_real_splits() -> anyhow::Result<()> {
        let doc_mapping_yaml = r#"
            field_mappings:
              - name: body
                type: text
              - name: ts
                type: datetime
                input_formats: [unix_timestamp]
                fast: true
            timestamp_field: ts
        "#;
        let test_sandbox =
            TestSandbox::create("test-index-mature2", doc_mapping_yaml, "", &["body"]).await?;

        // each add_documents() call produces 1 split
        for i in 0..4u64 {
            test_sandbox
                .add_documents(std::iter::once(
                    serde_json::json!({"body": format!("doc{i}"), "ts": 1_631_072_713u64 + i}),
                ))
                .await?;
        }

        let metastore = test_sandbox.metastore();
        let index_uid = test_sandbox.index_uid();

        let split_metas: Vec<SplitMetadata> = metastore
            .list_splits(ListSplitsRequest::try_from_index_uid(index_uid.clone()).unwrap())
            .await?
            .collect_splits_metadata()
            .await?;
        assert_eq!(split_metas.len(), 4);

        let index_metadata = metastore
            .index_metadata(IndexMetadataRequest::for_index_id(
                index_uid.index_id.to_string(),
            ))
            .await?
            .deserialize_index_metadata()?;

        // A single operation merging the 4 splits listed above.
        let merge_op = MergeOperation::new_merge_operation(split_metas);
        let split_store =
            IndexingSplitStore::create_without_local_store_for_test(test_sandbox.storage());
        let data_dir = TempDirectory::for_test();
        let semaphore = Arc::new(Semaphore::new(2));

        let outcome = run_mature_merges_for_index(
            &index_metadata,
            vec![merge_op],
            metastore.clone(),
            split_store,
            semaphore,
            data_dir.path(),
            &MatureMergeConfig::default(),
            test_sandbox.node_id(),
        )
        .await?;

        assert_eq!(outcome.num_published_merges, 1);
        assert_eq!(outcome.num_replaced_splits, 4);

        // The 4 input splits are now MarkedForDeletion; 1 merged Published split should remain.
        let published_after: Vec<SplitMetadata> = metastore
            .list_splits(ListSplitsRequest::try_from_list_splits_query(
                &ListSplitsQuery::for_index(index_uid).with_split_state(SplitState::Published),
            )?)
            .await?
            .collect_splits_metadata()
            .await?;
        assert_eq!(published_after.len(), 1);
        assert_eq!(published_after[0].num_docs, 4);
        assert_eq!(published_after[0].maturity, SplitMaturity::Mature);
        assert_eq!(
            published_after[0].time_range,
            Some(1_631_072_713..=1_631_072_716)
        );

        test_sandbox.assert_quit().await;
        Ok(())
    }

    /// Tests `merge_mature_single_index` on an index whose doc mapping was updated between two
    /// batches of splits: splits are merged per `doc_mapping_uid`, never across mappings.
    #[tokio::test]
    async fn test_merge_mature_single_index_schema_evolution() -> anyhow::Result<()> {
        let doc_mapping_v1_yaml = r#"
            field_mappings:
              - name: ts
                type: datetime
                input_formats: [unix_timestamp]
                fast: true
              - name: label
                type: text
                fast: true
                tokenizer: lowercase
            timestamp_field: ts
        "#;
        let test_sandbox =
            TestSandbox::create("test-index-schema-evo", doc_mapping_v1_yaml, "", &["label"])
                .await?;

        let base_time = 1_631_072_713i64; // Wednesday, September 8, 2021 at 3:45:13 AM UTC

        // create 3 splits with v1 mapping
        for i in 0..3i64 {
            test_sandbox
                .add_documents(std::iter::once(
                    serde_json::json!({"label": format!("Doc{i}"), "ts": base_time + i}),
                ))
                .await?;
        }

        let metastore = test_sandbox.metastore();
        let index_uid = test_sandbox.index_uid();

        let v1_splits: Vec<SplitMetadata> = metastore
            .list_splits(ListSplitsRequest::try_from_index_uid(index_uid.clone()).unwrap())
            .await?
            .collect_splits_metadata()
            .await?;
        assert_eq!(v1_splits.len(), 3);
        let v1_doc_mapping_uid = v1_splits[0].doc_mapping_uid;

        // Update the index config: change tokenizer to `default` and add a secondary timestamp.
        let index_metadata_v1 = metastore
            .index_metadata(IndexMetadataRequest::for_index_id(
                index_uid.index_id.to_string(),
            ))
            .await?
            .deserialize_index_metadata()?;
        let doc_mapping_v2 = ConfigFormat::Yaml.parse(
            r#"
            field_mappings:
              - name: ts
                type: datetime
                input_formats: [unix_timestamp]
                fast: true
              - name: label
                type: text
                fast: true
                tokenizer: default
              - name: ts2
                type: datetime
                input_formats: [unix_timestamp]
                fast: true
            timestamp_field: ts
            secondary_timestamp_field: ts2
            "#
            .as_bytes(),
        )?;
        let update_request = UpdateIndexRequest::try_from_updates(
            index_uid.clone(),
            &doc_mapping_v2,
            &index_metadata_v1.index_config.indexing_settings,
            &index_metadata_v1.index_config.ingest_settings,
            &index_metadata_v1.index_config.search_settings,
            &index_metadata_v1.index_config.retention_policy_opt,
        )?;
        metastore.update_index(update_request).await?;

        // create 3 more splits with v2 mapping
        for i in 3..6i64 {
            test_sandbox
                .add_documents(std::iter::once(serde_json::json!({
                    "label": format!("Doc{i}"),
                    "ts": base_time + i,
                    "ts2": base_time + i + 1000,
                })))
                .await?;
        }

        let all_splits: Vec<SplitMetadata> = metastore
            .list_splits(ListSplitsRequest::try_from_index_uid(index_uid.clone()).unwrap())
            .await?
            .collect_splits_metadata()
            .await?;
        assert_eq!(all_splits.len(), 6);
        // Any split whose uid differs from the v1 uid was produced with the v2 mapping.
        let v2_doc_mapping_uid = all_splits
            .iter()
            .find(|s| s.doc_mapping_uid != v1_doc_mapping_uid)
            .unwrap()
            .doc_mapping_uid;
        assert_eq!(
            all_splits
                .iter()
                .filter(|s| s.doc_mapping_uid == v1_doc_mapping_uid)
                .count(),
            3
        );
        assert_eq!(
            all_splits
                .iter()
                .filter(|s| s.doc_mapping_uid == v2_doc_mapping_uid)
                .count(),
            3
        );

        let index_metadata_v2 = metastore
            .index_metadata(IndexMetadataRequest::for_index_id(
                index_uid.index_id.to_string(),
            ))
            .await?
            .deserialize_index_metadata()?;
        let data_dir = TempDirectory::for_test();
        let semaphore = Arc::new(Semaphore::new(2));
        // Splits have the default 48h maturation period. Pass a `now` far enough in the future
        // so all splits (both v1 and v2) are mature at `now - MATURITY_BUFFER (6h)`.
        let now = OffsetDateTime::now_utc() + time::Duration::days(3);
        // Override min_merge_group_size to 2 so that 3-split groups qualify.
        let config = MatureMergeConfig {
            min_merge_group_size: 2,
            ..MatureMergeConfig::default()
        };

        let summary = merge_mature_single_index(
            index_metadata_v2,
            &metastore,
            &test_sandbox.storage_resolver(),
            semaphore,
            data_dir.path(),
            &config,
            test_sandbox.node_id(),
            now,
        )
        .await?;

        // Both the v1 and v2 groups (3 splits each, different doc_mapping_uid) get merged.
        assert_eq!(summary.num_merges_planned, 2);
        assert_eq!(summary.outcome.num_published_merges, 2);
        assert_eq!(summary.outcome.num_replaced_splits, 6);

        let published_after: Vec<SplitMetadata> = metastore
            .list_splits(ListSplitsRequest::try_from_list_splits_query(
                &ListSplitsQuery::for_index(index_uid).with_split_state(SplitState::Published),
            )?)
            .await?
            .collect_splits_metadata()
            .await?;
        assert_eq!(published_after.len(), 2);

        // The merged v1 split preserves the original doc_mapping_uid, time range, and has no
        // secondary_time_range because the v1 schema had no secondary timestamp field.
        let merged_v1 = published_after
            .iter()
            .find(|s| s.doc_mapping_uid == v1_doc_mapping_uid)
            .expect("merged v1 split must exist");
        assert_eq!(merged_v1.num_docs, 3);
        assert_eq!(merged_v1.maturity, SplitMaturity::Mature);
        assert_eq!(merged_v1.time_range, Some(base_time..=base_time + 2));
        assert_eq!(merged_v1.secondary_time_range, None);

        // The merged v2 split has the updated doc_mapping_uid and a secondary_time_range
        // derived from the ts2 field.
        let merged_v2 = published_after
            .iter()
            .find(|s| s.doc_mapping_uid == v2_doc_mapping_uid)
            .expect("merged v2 split must exist");
        assert_eq!(merged_v2.num_docs, 3);
        assert_eq!(merged_v2.maturity, SplitMaturity::Mature);
        assert_eq!(merged_v2.time_range, Some(base_time + 3..=base_time + 5));
        assert_eq!(
            merged_v2.secondary_time_range,
            Some(base_time + 1003..=base_time + 1005)
        );

        test_sandbox.assert_quit().await;
        Ok(())
    }
}
+// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; +use std::time::Duration; + +use quickwit_config::IndexConfig; +use quickwit_metastore::SplitMetadata; +use time::OffsetDateTime; + +use crate::mature_merge::MatureMergeConfig; +use crate::merge_policy::MergeOperation; + +pub const SECS_PER_DAY: i64 = 60 * 60 * 24; + +/// Wait a couple of hours after the split got mature to be extra sure no merge +/// process is still running on it. +pub const MATURITY_BUFFER: Duration = Duration::from_hours(6); + +/// Computes the earliest UTC-day midnight (seconds since epoch) that is safe to merge, +/// given the index's retention policy and the current time. +fn retention_safety_cutoff_secs( + index_config: &IndexConfig, + now_secs: i64, + config: &MatureMergeConfig, +) -> Option { + let retention_policy = index_config.retention_policy_opt.as_ref()?; + let period = retention_policy.retention_period().ok()?; + let retention_safety_buffer = Duration::from_hours(config.retention_safety_buffer_days * 24); + if period <= retention_safety_buffer { + // No safe window: exclude every split by returning a cutoff in the far future. + return Some(i64::MAX); + } + let cutoff_raw = now_secs - period.as_secs() as i64 + retention_safety_buffer.as_secs() as i64; + // Round up to the next day boundary so we never partially exclude a day bucket. + Some((cutoff_raw / SECS_PER_DAY + 1) * SECS_PER_DAY) +} + +/// Converts a single day-bucket group of eligible splits into one or more balanced +/// [`MergeOperation`]s respecting constraints. +fn plan_operations_for_group( + mut group_splits: Vec, + config: &MatureMergeConfig, +) -> Vec { + if group_splits.len() < config.min_merge_group_size { + return Vec::new(); + } + // Sort ascending by end time so each sub-operation covers the most compact range. 
+ group_splits.sort_by_key(|s| s.time_range.as_ref().map(|r| *r.end()).unwrap_or(0)); + + let n = group_splits.len(); + let total_docs: usize = group_splits.iter().map(|s| s.num_docs).sum(); + + // Minimum number of balanced operations needed to respect both per-operation limits. + let k = n + .div_ceil(config.max_merge_group_size) + .max(total_docs.div_ceil(config.split_target_num_docs)) + .max(1); + + // Divide into k balanced chunks (first chunks are ≥ last chunks by at most 1 split). + let chunk_size = n.div_ceil(k); + group_splits + .chunks(chunk_size) + .filter(|chunk| chunk.len() >= config.min_merge_group_size) + .map(|chunk| MergeOperation::new_merge_operation(chunk.to_vec())) + .collect() +} + +/// Group by UTC day (floored to midnight in seconds) of the split's time range, +/// and returns one or more [`MergeOperation`]s per group that meets the size +/// threshold. +/// +/// Rules: +/// - Splits without a `time_range` are skipped (cannot assign a day). +/// - A split is only assigned to a bucket when the UTC day number of `time_range.end()` minus the +/// UTC day number of `time_range.start()` equals the configured number of days. +/// - Immature splits are excluded. +/// - Splits whose `time_range.end()` falls within the retention safety buffer are excluded. +/// +/// Important: This plan merges splits accross sources. It can be problematic if +/// the IndexingSettings are different (e.g different maturation period), which +/// was made possible on Kafka sources by specifying an override in the +/// client_params. 
+pub fn plan_merge_operations_for_index( + index_config: &IndexConfig, + splits: Vec, + now: OffsetDateTime, + config: &MatureMergeConfig, +) -> Vec { + let now_secs = now.unix_timestamp(); + + let earliest_cutoff_timestamp = retention_safety_cutoff_secs(index_config, now_secs, config); + + // Key: (partition_id, doc_mapping_uid_string, day_bucket_seconds, secondary_day_opt) + let mut groups: HashMap<(u64, String, i64, Option), Vec> = HashMap::new(); + + for split in splits { + // Only splits that have been mature for a while + if !split.is_mature(now - MATURITY_BUFFER) { + continue; + } + + // Enforce the max size for splits to be considered for merging. + if split.num_docs > config.input_split_max_num_docs { + continue; + } + + // The timestamp field is required + let Some(ref time_range) = split.time_range else { + continue; + }; + + let start_day = time_range.start() / SECS_PER_DAY; + let end_day = time_range.end() / SECS_PER_DAY; + + // also group on secondary time range to make sure retention can still be applied + let secondary_day_opt = split + .secondary_time_range + .as_ref() + // In the nominal case, the secondary time (ingest time) is only + // slightly greater than the primary time (event time). Using + // `start()` here decreases the chances of further fragmenting the + // group at the day limits. + .map(|r| r.start() / SECS_PER_DAY); + + // Focus on splits with a specific timestamp range. + if end_day - start_day != (config.split_timestamp_days_range as i64) { + continue; + } + + // Check that we are not too close to the retention cutoff. 
+ if let Some(cutoff) = earliest_cutoff_timestamp + && *time_range.end() < cutoff + { + continue; + } + + let key = ( + split.partition_id, + split.doc_mapping_uid.to_string(), + start_day, + secondary_day_opt, + ); + groups.entry(key).or_default().push(split); + } + + let mut operations = Vec::new(); + for (_key, group_splits) in groups { + operations.extend(plan_operations_for_group(group_splits, config)); + } + operations +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use quickwit_config::{IndexConfig, RetentionPolicy}; + use quickwit_metastore::{SplitMaturity, SplitMetadata}; + use quickwit_proto::types::{DocMappingUid, IndexUid}; + use time::OffsetDateTime; + + use super::*; + + /// Builds a mature [`SplitMetadata`] for use in tests. + /// + /// - `day_bucket`: UTC day expressed as seconds-since-epoch (midnight). For example `day_bucket + /// = 0` means 1970-01-01, `day_bucket = SECS_PER_DAY` means 1970-01-02. + fn mature_split_for_test( + split_id: &str, + index_uid: &IndexUid, + partition_id: u64, + doc_mapping_uid: DocMappingUid, + num_docs: usize, + day_bucket: i64, + ) -> SplitMetadata { + SplitMetadata { + split_id: split_id.to_string(), + index_uid: index_uid.clone(), + partition_id, + num_docs, + doc_mapping_uid, + // Both endpoints on the same UTC day — the split spans one hour. + time_range: Some(day_bucket..=(day_bucket + 3600)), + maturity: SplitMaturity::Mature, + ..Default::default() + } + } + + fn index_config_no_retention() -> IndexConfig { + IndexConfig::for_test("test-index", "s3://test-bucket/test-index") + } + + fn index_config_with_retention(period: &str) -> IndexConfig { + let mut config = index_config_no_retention(); + config.retention_policy_opt = Some(RetentionPolicy { + retention_period: period.to_string(), + evaluation_schedule: "daily".to_string(), + timestamp_type: Default::default(), + }); + config + } + + // UTC day 0 = 1970-01-01. Use a recent-ish day to avoid the retention buffer. 
+ // We use day 20000 (approx 2024-10) so splits are "recent" relative to a "now" we control. + const RECENT_DAY: i64 = 20_000 * SECS_PER_DAY; + + fn now_well_after_recent_day() -> OffsetDateTime { + // 1 day after the splits' day — they are mature but not in a retention buffer. + OffsetDateTime::from_unix_timestamp(RECENT_DAY + SECS_PER_DAY + 1).unwrap() + } + + #[test] + fn test_plan_basic() { + let index_uid = IndexUid::for_test("test-index", 0); + let doc_mapping_uid = DocMappingUid::random(); + let splits: Vec = (0..10) + .map(|i| { + mature_split_for_test( + &format!("split-{i}"), + &index_uid, + 1, + doc_mapping_uid, + 100, + RECENT_DAY, + ) + }) + .collect(); + + let operations = plan_merge_operations_for_index( + &index_config_no_retention(), + splits, + now_well_after_recent_day(), + &MatureMergeConfig::default(), + ); + + assert_eq!(operations.len(), 1); + assert_eq!(operations[0].splits.len(), 10); + } + + #[test] + fn test_plan_below_threshold() { + let index_uid = IndexUid::for_test("test-index", 0); + let doc_mapping_uid = DocMappingUid::random(); + // Only 4 splits — below the min_merge_group_size (5). + let splits: Vec = (0..4) + .map(|i| { + mature_split_for_test( + &format!("split-{i}"), + &index_uid, + 1, + doc_mapping_uid, + 100, + RECENT_DAY, + ) + }) + .collect(); + + let operations = plan_merge_operations_for_index( + &index_config_no_retention(), + splits, + now_well_after_recent_day(), + &MatureMergeConfig { + min_merge_group_size: 5, + ..Default::default() + }, + ); + + assert!(operations.is_empty(), "expected no operations for 4 splits"); + } + + #[test] + fn test_plan_immature_splits_excluded() { + let index_uid = IndexUid::for_test("test-index", 0); + let doc_mapping_uid = DocMappingUid::random(); + let now = now_well_after_recent_day(); + let now_ts = now.unix_timestamp(); + + // All splits are immature (maturation period far in the future). 
+ let splits: Vec = (0..10) + .map(|i| { + let mut split = mature_split_for_test( + &format!("split-{i}"), + &index_uid, + 1, + doc_mapping_uid, + 100, + RECENT_DAY, + ); + split.maturity = SplitMaturity::Immature { + maturation_period: Duration::from_secs(999_999), + }; + // Make sure create_timestamp is recent so the split is truly immature. + split.create_timestamp = now_ts; + split + }) + .collect(); + + let operations = plan_merge_operations_for_index( + &index_config_no_retention(), + splits, + now, + &MatureMergeConfig::default(), + ); + + assert!(operations.is_empty(), "immature splits should be excluded"); + } + + #[test] + fn test_plan_multiday_split_skipped() { + let index_uid = IndexUid::for_test("test-index", 0); + let doc_mapping_uid = DocMappingUid::random(); + + // 10 splits, but each one spans midnight (start on day N, end on day N+1). + let splits: Vec = (0..10) + .map(|i| { + let mut split = mature_split_for_test( + &format!("split-{i}"), + &index_uid, + 1, + doc_mapping_uid, + 100, + RECENT_DAY, + ); + // Extend time_range to cross midnight. + split.time_range = Some(RECENT_DAY - 3600..=RECENT_DAY + 3600); + split + }) + .collect(); + + let operations = plan_merge_operations_for_index( + &index_config_no_retention(), + splits, + now_well_after_recent_day(), + &MatureMergeConfig::default(), + ); + + assert!(operations.is_empty(), "multi-day splits should be skipped"); + } + + #[test] + fn test_plan_retention_safety_buffer() { + let index_uid = IndexUid::for_test("test-index", 0); + let doc_mapping_uid = DocMappingUid::random(); + + // Retention period = 90 days. Safety buffer = 30 days. + // Splits must have time_range.end >= now - 90d + 30d = now - 60d. + // We put splits at RECENT_DAY but set "now" to be RECENT_DAY + 91 days. + // Then: cutoff_raw = (RECENT_DAY + 91d) - 90d + 30d = RECENT_DAY + 31d + // cutoff = RECENT_DAY + 32d (rounded up to next day boundary) + // Because RECENT_DAY + 3600 < cutoff, splits should be excluded. 
+ let now_ts = RECENT_DAY + 91 * SECS_PER_DAY; + let now = OffsetDateTime::from_unix_timestamp(now_ts).unwrap(); + + let splits: Vec = (0..10) + .map(|i| { + mature_split_for_test( + &format!("split-{i}"), + &index_uid, + 1, + doc_mapping_uid, + 100, + RECENT_DAY, + ) + }) + .collect(); + + let config = index_config_with_retention("90 days"); + + let merge_config = MatureMergeConfig { + retention_safety_buffer_days: 30, + ..MatureMergeConfig::default() + }; + let operations = plan_merge_operations_for_index(&config, splits, now, &merge_config); + + assert!( + operations.is_empty(), + "splits within retention safety buffer should be excluded" + ); + } + + #[test] + fn test_plan_retention_period_too_short_skipped() { + let index_uid = IndexUid::for_test("test-index", 0); + let doc_mapping_uid = DocMappingUid::random(); + + let splits: Vec = (0..10) + .map(|i| { + mature_split_for_test( + &format!("split-{i}"), + &index_uid, + 1, + doc_mapping_uid, + 100, + RECENT_DAY, + ) + }) + .collect(); + + // Retention period of 3 days is <= retention_safety_buffer_days (default 5 days) + // so the index should be skipped entirely. + let config = index_config_with_retention("3 days"); + + let operations = plan_merge_operations_for_index( + &config, + splits, + now_well_after_recent_day(), + &MatureMergeConfig::default(), + ); + + assert!( + operations.is_empty(), + "index with short retention should produce no operations" + ); + } + + #[test] + fn test_plan_different_partitions_grouped_separately() { + let index_uid = IndexUid::for_test("test-index", 0); + let doc_mapping_uid = DocMappingUid::random(); + + // 6 splits per partition, two partitions => 2 separate merge operations. 
+ let splits: Vec = (0..12) + .map(|i| { + mature_split_for_test( + &format!("split-{i}"), + &index_uid, + i as u64 / 6, // partition 0 for i in 0..6, partition 1 for i in 6..12 + doc_mapping_uid, + 100, + RECENT_DAY, + ) + }) + .collect(); + + let mut operations = plan_merge_operations_for_index( + &index_config_no_retention(), + splits, + now_well_after_recent_day(), + &MatureMergeConfig::default(), + ); + operations.sort_by_key(|op| op.splits[0].partition_id); + + assert_eq!(operations.len(), 2); + assert!(operations[0].splits.iter().all(|s| s.partition_id == 0)); + assert!(operations[1].splits.iter().all(|s| s.partition_id == 1)); + } + + #[test] + fn test_plan_split_timestamp_days_range_one() { + let index_uid = IndexUid::for_test("test-index", 0); + let doc_mapping_uid = DocMappingUid::random(); + + let splits: Vec = (0..60) + .map(|i| { + let mut split = mature_split_for_test( + &format!("split-{i}"), + &index_uid, + 1, + doc_mapping_uid, + 100, + RECENT_DAY, + ); + split.time_range = Some(RECENT_DAY..=RECENT_DAY + i * 3600); + split + }) + .collect(); + + let config = MatureMergeConfig { + split_timestamp_days_range: 1, + ..MatureMergeConfig::default() + }; + let operations = plan_merge_operations_for_index( + &index_config_no_retention(), + splits, + now_well_after_recent_day(), + &config, + ); + + assert_eq!(operations.len(), 1); + assert_eq!(operations[0].splits.len(), 24); + } +} diff --git a/quickwit/quickwit-indexing/src/models/indexed_split.rs b/quickwit/quickwit-indexing/src/models/indexed_split.rs index e129feede9b..a622b241da9 100644 --- a/quickwit/quickwit-indexing/src/models/indexed_split.rs +++ b/quickwit/quickwit-indexing/src/models/indexed_split.rs @@ -105,12 +105,12 @@ impl IndexedSplitBuilder { partition_id, split_id, num_docs: 0, - replaced_split_ids: Vec::new(), uncompressed_docs_size_in_bytes: 0, time_range: None, secondary_time_range: None, delete_opstamp: last_delete_opstamp, num_merge_ops: 0, + replaced_splits: Vec::new(), }, 
index_writer, split_scratch_directory, diff --git a/quickwit/quickwit-indexing/src/models/mod.rs b/quickwit/quickwit-indexing/src/models/mod.rs index 9dfdfde1594..d1642791933 100644 --- a/quickwit/quickwit-indexing/src/models/mod.rs +++ b/quickwit/quickwit-indexing/src/models/mod.rs @@ -47,7 +47,7 @@ use quickwit_proto::types::PublishToken; pub use raw_doc_batch::RawDocBatch; pub(crate) use shard_positions::LocalShardPositionsUpdate; pub use shard_positions::ShardPositionsService; -pub use split_attrs::{SplitAttrs, create_split_metadata}; +pub use split_attrs::{ReplacedSplit, SplitAttrs, create_split_metadata}; #[derive(Debug)] pub struct NewPublishToken(pub PublishToken); diff --git a/quickwit/quickwit-indexing/src/models/publisher_message.rs b/quickwit/quickwit-indexing/src/models/publisher_message.rs index 13182a8f76a..e1ba9eb8ae2 100644 --- a/quickwit/quickwit-indexing/src/models/publisher_message.rs +++ b/quickwit/quickwit-indexing/src/models/publisher_message.rs @@ -22,11 +22,11 @@ use tracing::Span; use crate::merge_policy::MergeTask; use crate::models::PublishLock; +use crate::models::split_attrs::ReplacedSplit; pub struct SplitsUpdate { pub index_uid: IndexUid, pub new_splits: Vec, - pub replaced_split_ids: Vec, pub checkpoint_delta_opt: Option, pub publish_lock: PublishLock, pub publish_token_opt: Option, @@ -36,6 +36,7 @@ pub struct SplitsUpdate { /// If `None`, the split batch was built in the `IndexingPipeline`. 
pub merge_task: Option, pub parent_span: Span, + pub replaced_splits: Vec, } impl fmt::Debug for SplitsUpdate { diff --git a/quickwit/quickwit-indexing/src/models/split_attrs.rs b/quickwit/quickwit-indexing/src/models/split_attrs.rs index dde48fab25a..4a8076c4ed6 100644 --- a/quickwit/quickwit-indexing/src/models/split_attrs.rs +++ b/quickwit/quickwit-indexing/src/models/split_attrs.rs @@ -25,6 +25,14 @@ use time::OffsetDateTime; use crate::merge_policy::MergePolicy; +#[derive(PartialEq, Eq, Debug, Default, Clone)] +pub struct ReplacedSplit { + pub split_id: SplitId, + /// Snapshot of the split's soft-deletes. These will be consolidated into + /// the split during the merge. + pub soft_deleted_doc_ids: BTreeSet, +} + pub struct SplitAttrs { /// ID of the node that produced the split. pub node_id: NodeId, @@ -61,13 +69,13 @@ pub struct SplitAttrs { pub time_range: Option>, pub secondary_time_range: Option>, - pub replaced_split_ids: Vec, - /// Delete opstamp. pub delete_opstamp: u64, // Number of merge operation the split has been through so far. 
pub num_merge_ops: usize, + + pub replaced_splits: Vec, } impl fmt::Debug for SplitAttrs { @@ -75,7 +83,14 @@ impl fmt::Debug for SplitAttrs { f.debug_struct("SplitAttrs") .field("split_id", &self.split_id) .field("partition_id", &self.partition_id) - .field("replaced_split_ids", &self.replaced_split_ids) + .field( + "replaced_split_ids", + &self + .replaced_splits + .iter() + .map(|s| &s.split_id) + .collect::>(), + ) .field("time_range", &self.time_range) .field( "uncompressed_docs_size_in_bytes", @@ -137,6 +152,7 @@ pub fn create_split_metadata( footer_offsets, delete_opstamp: split_attrs.delete_opstamp, num_merge_ops: split_attrs.num_merge_ops, + soft_deleted_doc_ids: BTreeSet::new(), } } diff --git a/quickwit/quickwit-indexing/src/soft_delete_query.rs b/quickwit/quickwit-indexing/src/soft_delete_query.rs new file mode 100644 index 00000000000..fad5ed564b9 --- /dev/null +++ b/quickwit/quickwit-indexing/src/soft_delete_query.rs @@ -0,0 +1,377 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; + +use tantivy::index::SegmentId; +use tantivy::query::{EmptyScorer, EnableScoring, Explanation, Query, Scorer, Weight}; +use tantivy::{DocId, DocSet, Score, SegmentReader, TERMINATED, TantivyError, Term}; + +/// A tantivy [`Query`] that matches specific doc IDs within their respective segments. 
+/// +/// Built from the `soft_deleted_doc_ids` fields of the input [`SplitMetadata`] structs, this +/// query is passed to [`IndexWriter::delete_query`] so that the matched documents are marked for +/// deletion and then physically removed during the subsequent tantivy merge. The query itself only +/// identifies which documents to remove; the actual deletion is performed by the caller. +#[derive(Clone, Debug)] +pub(crate) struct SoftDeletedDocIdsQuery { + /// Maps each segment ID to the **sorted** list of doc IDs to delete within that segment. + docs_per_segment: HashMap>, +} + +impl SoftDeletedDocIdsQuery { + pub(crate) fn new(docs_per_segment: HashMap>) -> Self { + Self { docs_per_segment } + } +} + +impl Query for SoftDeletedDocIdsQuery { + fn weight(&self, _: EnableScoring<'_>) -> tantivy::Result> { + Ok(Box::new(SoftDeletedDocIdsWeight { + docs_per_segment: self.docs_per_segment.clone(), + })) + } + + fn query_terms<'a>(&'a self, _visitor: &mut dyn FnMut(&'a Term, bool)) { + // Doc-ID–based query — no index terms to visit. + } +} + +/// Minimal `DocSet + Scorer` over a pre-sorted, deduplicated list of doc IDs. +/// +/// Starts positioned at the first document (no initial `advance()` call required). +struct SortedDocIdScorer { + doc_ids: Vec, + pos: usize, +} + +impl DocSet for SortedDocIdScorer { + fn advance(&mut self) -> DocId { + self.pos += 1; + self.doc() + } + + fn seek(&mut self, target: DocId) -> DocId { + // Binary-search to the first id >= target. 
+ self.pos = self.doc_ids.partition_point(|&id| id < target); + self.doc() + } + + fn doc(&self) -> DocId { + self.doc_ids.get(self.pos).copied().unwrap_or(TERMINATED) + } + + fn size_hint(&self) -> u32 { + self.doc_ids.len().saturating_sub(self.pos) as u32 + } +} + +impl Scorer for SortedDocIdScorer { + fn score(&mut self) -> Score { + 1.0 + } +} + +struct SoftDeletedDocIdsWeight { + docs_per_segment: HashMap>, +} + +impl Weight for SoftDeletedDocIdsWeight { + fn scorer(&self, reader: &SegmentReader, _boost: Score) -> tantivy::Result> { + let Some(doc_ids) = self.docs_per_segment.get(&reader.segment_id()) else { + return Ok(Box::new(EmptyScorer)); + }; + // Filter defensively: doc IDs must be < max_doc. The BTreeSet source guarantees + // strict ascending order, which SortedDocIdScorer requires. + let doc_ids: Vec = doc_ids + .iter() + .copied() + .filter(|&id| id < reader.max_doc()) + .collect(); + if doc_ids.is_empty() { + return Ok(Box::new(EmptyScorer)); + } + Ok(Box::new(SortedDocIdScorer { doc_ids, pos: 0 })) + } + + fn explain(&self, reader: &SegmentReader, doc: DocId) -> tantivy::Result { + let is_deleted = self + .docs_per_segment + .get(&reader.segment_id()) + .map(|ids| ids.binary_search(&doc).is_ok()) + .unwrap_or(false); + if is_deleted { + Ok(Explanation::new("SoftDeletedDocIdsQuery", 1.0)) + } else { + Err(TantivyError::InvalidArgument(format!( + "Document #{doc} is not soft-deleted in this segment" + ))) + } + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + + use tantivy::collector::TopDocs; + use tantivy::index::SegmentId; + use tantivy::query::AllQuery; + use tantivy::schema::{STORED, Schema, TEXT, Value}; + use tantivy::{Index, IndexWriter, ReloadPolicy, TantivyDocument, doc}; + + use super::*; + + /// Build an in-RAM single-segment index where each entry in `texts` becomes + /// one stored document. All documents are committed in a single pass so + /// tantivy assigns them contiguous doc IDs starting at 0. 
+ fn make_index(texts: &[&str]) -> tantivy::Result<(Index, tantivy::schema::Field)> { + let mut schema_builder = Schema::builder(); + let body = schema_builder.add_text_field("body", TEXT | STORED); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + let mut writer: IndexWriter = index.writer(15_000_000)?; + for text in texts { + writer.add_document(doc!(body => *text))?; + } + writer.commit()?; + Ok((index, body)) + } + + /// Apply `query` via `IndexWriter::delete_query`, commit, and return a + /// freshly-opened reader that reflects the resulting deletion state. + fn apply_delete_query( + index: &Index, + query: SoftDeletedDocIdsQuery, + ) -> tantivy::Result { + let mut writer: IndexWriter = index.writer(15_000_000)?; + writer.delete_query(Box::new(query))?; + writer.commit()?; + index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into() + } + + /// Collect and sort the stored body values of all live documents so that + /// tests can assert on the exact surviving content, independent of score + /// ordering. 
+ fn live_bodies( + reader: &tantivy::IndexReader, + body: tantivy::schema::Field, + ) -> tantivy::Result> { + let searcher = reader.searcher(); + let top_docs = searcher.search(&AllQuery, &TopDocs::with_limit(1_000).order_by_score())?; + let mut texts: Vec = top_docs + .iter() + .map(|(_, addr)| { + let doc: TantivyDocument = searcher.doc(*addr).unwrap(); + doc.get_first(body) + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string() + }) + .collect(); + texts.sort(); + Ok(texts) + } + + #[test] + fn test_delete_query_removes_targeted_docs() -> tantivy::Result<()> { + let (index, _) = make_index(&["a", "b", "c", "d", "e"])?; + let reader = index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into()?; + let searcher = reader.searcher(); + let seg_readers = searcher.segment_readers(); + assert_eq!( + seg_readers.len(), + 1, + "expected a single segment after one commit" + ); + let segment_id = seg_readers[0].segment_id(); + drop(searcher); + + // Target doc IDs 1 ("b") and 3 ("d"). + let query = SoftDeletedDocIdsQuery::new(HashMap::from([(segment_id, vec![1u32, 3u32])])); + let reader_after = apply_delete_query(&index, query)?; + let searcher_after = reader_after.searcher(); + let seg = &searcher_after.segment_readers()[0]; + + assert_eq!(seg.num_docs(), 3, "exactly 3 docs must survive"); + Ok(()) + } + + #[test] + fn test_delete_query_leaves_correct_docs_alive() -> tantivy::Result<()> { + let (index, body) = make_index(&["a", "b", "c", "d", "e"])?; + let reader = index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into()?; + let segment_id = { + let searcher = reader.searcher(); + searcher.segment_readers()[0].segment_id() + }; + + // Delete docs 1 ("b") and 3 ("d"); "a", "c", "e" must survive. 
+ let query = SoftDeletedDocIdsQuery::new(HashMap::from([(segment_id, vec![1u32, 3u32])])); + let reader_after = apply_delete_query(&index, query)?; + + let surviving = live_bodies(&reader_after, body)?; + assert_eq!(surviving, vec!["a", "c", "e"]); + Ok(()) + } + + #[test] + fn test_delete_query_removes_all_docs() -> tantivy::Result<()> { + let (index, _) = make_index(&["x", "y", "z"])?; + let segment_id = { + let reader = index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into()?; + let searcher = reader.searcher(); + searcher.segment_readers()[0].segment_id() + }; + + let query = + SoftDeletedDocIdsQuery::new(HashMap::from([(segment_id, vec![0u32, 1u32, 2u32])])); + let reader_after = apply_delete_query(&index, query)?; + let searcher_after = reader_after.searcher(); + + let total_live_docs: u32 = searcher_after + .segment_readers() + .iter() + .map(|r| r.num_docs()) + .sum(); + assert_eq!(total_live_docs, 0, "all docs must be deleted"); + Ok(()) + } + + #[test] + fn test_delete_query_boundary_doc_ids() -> tantivy::Result<()> { + // Deleting the very first (0) and very last (3) doc IDs exercises the boundary + // positions of SortedDocIdScorer. 
+ let (index, body) = make_index(&["a", "b", "c", "d"])?; + let segment_id = { + let reader = index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into()?; + let searcher = reader.searcher(); + searcher.segment_readers()[0].segment_id() + }; + + let query = SoftDeletedDocIdsQuery::new(HashMap::from([(segment_id, vec![0u32, 3u32])])); + let reader_after = apply_delete_query(&index, query)?; + + let surviving = live_bodies(&reader_after, body)?; + assert_eq!(surviving, vec!["b", "c"]); + Ok(()) + } + + #[test] + fn test_delete_query_single_doc() -> tantivy::Result<()> { + let (index, body) = make_index(&["keep", "remove", "keep-too"])?; + let segment_id = { + let reader = index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into()?; + let searcher = reader.searcher(); + searcher.segment_readers()[0].segment_id() + }; + + let query = SoftDeletedDocIdsQuery::new(HashMap::from([(segment_id, vec![1u32])])); + let reader_after = apply_delete_query(&index, query)?; + + let surviving = live_bodies(&reader_after, body)?; + assert_eq!(surviving, vec!["keep", "keep-too"]); + Ok(()) + } + + #[test] + fn test_delete_query_unknown_segment_id_has_no_effect() -> tantivy::Result<()> { + let (index, _) = make_index(&["a", "b", "c"])?; + + // Obtain a segment ID that definitely does not belong to `index` by + // creating an independent second index. + let (other_index, _) = make_index(&["z"])?; + let foreign_id: SegmentId = { + let other_reader: tantivy::IndexReader = other_index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into()?; + let other_searcher = other_reader.searcher(); + other_searcher.segment_readers()[0].segment_id() + }; + + // Targeting all three doc IDs under the foreign segment must not delete anything. 
+ let query = + SoftDeletedDocIdsQuery::new(HashMap::from([(foreign_id, vec![0u32, 1u32, 2u32])])); + let reader_after = apply_delete_query(&index, query)?; + let searcher_after = reader_after.searcher(); + + assert_eq!( + searcher_after.segment_readers()[0].num_docs(), + 3, + "unknown segment ID must leave all docs intact" + ); + Ok(()) + } + + #[test] + fn test_delete_query_out_of_range_doc_ids_are_ignored() -> tantivy::Result<()> { + // The index has 2 docs (max_doc = 2, valid IDs are 0 and 1). + // Providing only out-of-range IDs must not delete anything. + let (index, _) = make_index(&["a", "b"])?; + let segment_id = { + let reader = index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into()?; + let searcher = reader.searcher(); + searcher.segment_readers()[0].segment_id() + }; + + let query = + SoftDeletedDocIdsQuery::new(HashMap::from([(segment_id, vec![10u32, 20u32, 100u32])])); + let reader_after = apply_delete_query(&index, query)?; + let searcher_after = reader_after.searcher(); + + assert_eq!( + searcher_after.segment_readers()[0].num_docs(), + 2, + "out-of-range doc IDs must be silently ignored" + ); + Ok(()) + } + + #[test] + fn test_delete_query_empty_map_has_no_effect() -> tantivy::Result<()> { + let (index, _) = make_index(&["a", "b", "c"])?; + let query = SoftDeletedDocIdsQuery::new(HashMap::new()); + let reader_after = apply_delete_query(&index, query)?; + let searcher_after = reader_after.searcher(); + + assert_eq!( + searcher_after.segment_readers()[0].num_docs(), + 3, + "empty docs-per-segment map must delete nothing" + ); + Ok(()) + } +} diff --git a/quickwit/quickwit-indexing/src/source/ingest/mod.rs b/quickwit/quickwit-indexing/src/source/ingest/mod.rs index d9e21affb87..63c746aabe0 100644 --- a/quickwit/quickwit-indexing/src/source/ingest/mod.rs +++ b/quickwit/quickwit-indexing/src/source/ingest/mod.rs @@ -410,9 +410,8 @@ impl IngestSource { .assigned_shards .keys() .filter(|&shard_id| 
!new_assigned_shard_ids.contains(shard_id)) - .cloned() .any(|removed_shard_id| { - let Some(assigned_shard) = self.assigned_shards.get(&removed_shard_id) else { + let Some(assigned_shard) = self.assigned_shards.get(removed_shard_id) else { return false; }; assigned_shard.status != IndexingStatus::Complete diff --git a/quickwit/quickwit-indexing/src/source/kafka_source.rs b/quickwit/quickwit-indexing/src/source/kafka_source.rs index 5f93d0a9344..f1aca45bb98 100644 --- a/quickwit/quickwit-indexing/src/source/kafka_source.rs +++ b/quickwit/quickwit-indexing/src/source/kafka_source.rs @@ -25,7 +25,7 @@ use quickwit_actors::{ActorExitStatus, Mailbox}; use quickwit_config::KafkaSourceParams; use quickwit_metastore::checkpoint::{PartitionId, SourceCheckpoint}; use quickwit_proto::metastore::SourceType; -use quickwit_proto::types::{IndexUid, Position}; +use quickwit_proto::types::{IndexUid, NodeIdRef, Position}; use rdkafka::config::{ClientConfig, RDKafkaLogLevel}; use rdkafka::consumer::{ BaseConsumer, CommitMode, Consumer, ConsumerContext, DefaultConsumerContext, Rebalance, @@ -240,6 +240,7 @@ impl KafkaSource { let (events_tx, events_rx) = mpsc::channel(100); let (truncate_tx, truncate_rx) = watch::channel(SourceCheckpoint::default()); let (client_config, consumer, group_id) = create_consumer( + source_runtime.node_id(), source_runtime.index_uid(), source_runtime.source_id(), source_params, @@ -654,6 +655,7 @@ pub(super) async fn check_connectivity(params: KafkaSourceParams) -> anyhow::Res /// Creates a new `KafkaSourceConsumer`. 
fn create_consumer( + node_id: &NodeIdRef, index_uid: &IndexUid, source_id: &str, params: KafkaSourceParams, @@ -676,6 +678,7 @@ fn create_consumer( params.enable_backfill_mode.to_string(), ) .set("group.id", &group_id) + .set("client.id", node_id.as_str()) .set_log_level(log_level) .create_with_context(RdKafkaContext { topic: params.topic, diff --git a/quickwit/quickwit-integration-tests/src/tests/update_tests/mod.rs b/quickwit/quickwit-integration-tests/src/tests/update_tests/mod.rs index ac9e24e517d..5e1b1ec08da 100644 --- a/quickwit/quickwit-integration-tests/src/tests/update_tests/mod.rs +++ b/quickwit/quickwit-integration-tests/src/tests/update_tests/mod.rs @@ -38,7 +38,6 @@ async fn assert_hits_unordered( .await; if let Ok(expected_hits) = expected_result { let resp = search_res.unwrap_or_else(|err| panic!("query: {query}, error: {err}")); - assert_eq!(resp.errors.len(), 0, "query: {query}"); assert_eq!(resp.num_hits, expected_hits.len() as u64, "query: {query}"); for expected_hit in expected_hits { assert!( @@ -49,8 +48,8 @@ async fn assert_hits_unordered( resp.hits ); } - } else if let Ok(search_response) = search_res { - assert!(!search_response.errors.is_empty(), "query: {query}"); + } else { + search_res.unwrap_err(); } } diff --git a/quickwit/quickwit-jaeger/src/lib.rs b/quickwit/quickwit-jaeger/src/lib.rs index 1b6dfc27d0c..cdf8c8b11f9 100644 --- a/quickwit/quickwit-jaeger/src/lib.rs +++ b/quickwit/quickwit-jaeger/src/lib.rs @@ -2718,7 +2718,6 @@ mod tests { num_hits: 2, hits: vec![], elapsed_time_micros: 100, - errors: Vec::new(), aggregation_postcard: Some(aggregation_postcard), scroll_id: None, failed_splits: Vec::new(), diff --git a/quickwit/quickwit-metastore/src/metastore/control_plane_metastore.rs b/quickwit/quickwit-metastore/src/metastore/control_plane_metastore.rs index bcb07d79020..0c05f982e31 100644 --- a/quickwit/quickwit-metastore/src/metastore/control_plane_metastore.rs +++ 
b/quickwit/quickwit-metastore/src/metastore/control_plane_metastore.rs @@ -20,20 +20,21 @@ use quickwit_proto::control_plane::{ControlPlaneService, ControlPlaneServiceClie use quickwit_proto::metastore::{ AcquireShardsRequest, AcquireShardsResponse, AddSourceRequest, CreateIndexRequest, CreateIndexResponse, CreateIndexTemplateRequest, DeleteIndexRequest, - DeleteIndexTemplatesRequest, DeleteQuery, DeleteShardsRequest, DeleteShardsResponse, - DeleteSourceRequest, DeleteSplitsRequest, DeleteTask, EmptyResponse, + DeleteIndexTemplatesRequest, DeleteKvRequest, DeleteQuery, DeleteShardsRequest, + DeleteShardsResponse, DeleteSourceRequest, DeleteSplitsRequest, DeleteTask, EmptyResponse, FindIndexTemplateMatchesRequest, FindIndexTemplateMatchesResponse, GetClusterIdentityRequest, - GetClusterIdentityResponse, GetIndexTemplateRequest, GetIndexTemplateResponse, - IndexMetadataRequest, IndexMetadataResponse, IndexesMetadataRequest, IndexesMetadataResponse, - LastDeleteOpstampRequest, LastDeleteOpstampResponse, ListDeleteTasksRequest, - ListDeleteTasksResponse, ListIndexStatsRequest, ListIndexStatsResponse, + GetClusterIdentityResponse, GetIndexTemplateRequest, GetIndexTemplateResponse, GetKvRequest, + GetKvResponse, IndexMetadataRequest, IndexMetadataResponse, IndexesMetadataRequest, + IndexesMetadataResponse, LastDeleteOpstampRequest, LastDeleteOpstampResponse, + ListDeleteTasksRequest, ListDeleteTasksResponse, ListIndexStatsRequest, ListIndexStatsResponse, ListIndexTemplatesRequest, ListIndexTemplatesResponse, ListIndexesMetadataRequest, ListIndexesMetadataResponse, ListShardsRequest, ListShardsResponse, ListSplitsRequest, ListSplitsResponse, ListStaleSplitsRequest, MarkSplitsForDeletionRequest, MetastoreResult, MetastoreService, MetastoreServiceClient, MetastoreServiceStream, OpenShardsRequest, OpenShardsResponse, PruneShardsRequest, PublishSplitsRequest, ResetSourceCheckpointRequest, - StageSplitsRequest, ToggleSourceRequest, UpdateIndexRequest, UpdateSourceRequest, - 
UpdateSplitsDeleteOpstampRequest, UpdateSplitsDeleteOpstampResponse, + SetKvRequest, SoftDeleteDocumentsRequest, SoftDeleteDocumentsResponse, StageSplitsRequest, + ToggleSourceRequest, UpdateIndexRequest, UpdateSourceRequest, UpdateSplitsDeleteOpstampRequest, + UpdateSplitsDeleteOpstampResponse, }; /// A [`MetastoreService`] implementation that proxies some requests to the control plane so it can @@ -188,6 +189,13 @@ impl MetastoreService for ControlPlaneMetastore { self.metastore.delete_splits(request).await } + async fn soft_delete_documents( + &self, + request: SoftDeleteDocumentsRequest, + ) -> MetastoreResult { + self.metastore.soft_delete_documents(request).await + } + async fn reset_source_checkpoint( &self, request: ResetSourceCheckpointRequest, @@ -289,4 +297,16 @@ impl MetastoreService for ControlPlaneMetastore { ) -> MetastoreResult { self.metastore.get_cluster_identity(request).await } + + async fn get_kv(&self, request: GetKvRequest) -> MetastoreResult { + self.metastore.get_kv(request).await + } + + async fn set_kv(&self, request: SetKvRequest) -> MetastoreResult { + self.metastore.set_kv(request).await + } + + async fn delete_kv(&self, request: DeleteKvRequest) -> MetastoreResult { + self.metastore.delete_kv(request).await + } } diff --git a/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs b/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs index 4b53cbf648b..bd1677e89fd 100644 --- a/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs +++ b/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs @@ -32,7 +32,7 @@ use quickwit_proto::metastore::{ AcquireShardsRequest, AcquireShardsResponse, DeleteQuery, DeleteShardsRequest, DeleteShardsResponse, DeleteTask, EntityKind, IndexStats, ListShardsSubrequest, ListShardsSubresponse, MetastoreError, MetastoreResult, OpenShardSubrequest, - OpenShardSubresponse, PruneShardsRequest, SplitStats, + 
OpenShardSubresponse, PruneShardsRequest, SplitDocIds, SplitStats, }; use quickwit_proto::types::{IndexUid, PublishToken, SourceId, SplitId}; use serde::{Deserialize, Serialize}; @@ -43,7 +43,7 @@ use tracing::{info, warn}; use super::MutationOccurred; use crate::checkpoint::IndexCheckpointDelta; -use crate::metastore::{SortBy, use_shard_api}; +use crate::metastore::{MAX_SOFT_DELETED_DOCS_PER_SPLIT, SortBy, use_shard_api}; use crate::{IndexMetadata, ListSplitsQuery, Split, SplitMetadata, SplitState, split_tag_filter}; /// A `FileBackedIndex` object carries an index metadata and its split metadata. @@ -498,6 +498,63 @@ impl FileBackedIndex { Ok(()) } + /// Soft-deletes individual documents within published splits. + pub(crate) fn soft_delete_documents( + &mut self, + split_doc_ids: &[SplitDocIds], + ) -> MetastoreResult { + // First pass: validate all splits before making any changes to guarantee atomicity. + for entry in split_doc_ids { + let split = self.splits.get(&entry.split_id).ok_or_else(|| { + MetastoreError::NotFound(EntityKind::Split { + split_id: entry.split_id.clone(), + }) + })?; + if split.split_state != SplitState::Published { + return Err(MetastoreError::FailedPrecondition { + entity: EntityKind::Split { + split_id: entry.split_id.clone(), + }, + message: format!("split `{}` is not in Published state", entry.split_id), + }); + } + let current_count = split.split_metadata.soft_deleted_doc_ids.len(); + let new_unique_count = entry + .doc_ids + .iter() + .filter(|&&id| !split.split_metadata.soft_deleted_doc_ids.contains(&id)) + .count(); + if current_count + new_unique_count > MAX_SOFT_DELETED_DOCS_PER_SPLIT { + return Err(MetastoreError::FailedPrecondition { + entity: EntityKind::Split { + split_id: entry.split_id.clone(), + }, + message: format!( + "split `{}` would exceed the maximum number of soft-deleted documents \ + ({MAX_SOFT_DELETED_DOCS_PER_SPLIT}): current={current_count}, would be={}", + entry.split_id, + current_count + new_unique_count, + 
), + }); + } + } + + // Second pass: all splits are valid — apply changes. + let mut num_soft_deleted = 0u64; + for entry in split_doc_ids { + let split = self + .splits + .get_mut(&entry.split_id) + .expect("split existence validated in first pass"); + for &doc_id in &entry.doc_ids { + if split.split_metadata.soft_deleted_doc_ids.insert(doc_id) { + num_soft_deleted += 1; + } + } + } + Ok(num_soft_deleted) + } + /// Gets IndexStats for this index pub(crate) fn get_stats(&self) -> MetastoreResult { let mut staged_stats = SplitStats::default(); @@ -724,6 +781,11 @@ impl Debug for Stamper { } fn split_query_predicate(split: &&Split, query: &ListSplitsQuery) -> bool { + if let Some(split_ids) = &query.split_ids + && !split_ids.contains(&split.split_metadata.split_id) + { + return false; + } if !split_tag_filter(&split.split_metadata, query.tags.as_ref()) { return false; } @@ -814,11 +876,14 @@ mod tests { use quickwit_doc_mapper::tag_pruning::TagFilterAst; use quickwit_proto::ingest::Shard; - use quickwit_proto::metastore::{ListShardsSubrequest, SplitStats}; + use quickwit_proto::metastore::{ + EntityKind, ListShardsSubrequest, MetastoreError, SplitDocIds, SplitStats, + }; use quickwit_proto::types::{IndexUid, SourceId}; use super::FileBackedIndex; use crate::file_backed::file_backed_index::split_query_predicate; + use crate::metastore::MAX_SOFT_DELETED_DOCS_PER_SPLIT; use crate::{IndexMetadata, ListSplitsQuery, Split, SplitMetadata, SplitState}; impl FileBackedIndex { @@ -949,6 +1014,15 @@ mod tests { assert!(split_query_predicate(&&split_1, &query)); assert!(split_query_predicate(&&split_2, &query)); assert!(!split_query_predicate(&&split_3, &query)); + + let query = ListSplitsQuery::for_index(IndexUid::new_with_random_ulid("test-index")) + .with_split_ids(vec![ + split_1.split_metadata.split_id.clone(), + split_2.split_metadata.split_id.clone(), + ]); + assert!(split_query_predicate(&&split_1, &query)); + assert!(split_query_predicate(&&split_2, &query)); + 
assert!(!split_query_predicate(&&split_3, &query)); } #[test] @@ -1019,4 +1093,151 @@ mod tests { assert_eq!(stats.published, expected_published); assert_eq!(stats.marked_for_deletion, expected_marked_for_deletion); } + + /// Helper: creates a `FileBackedIndex` with a single published split. + fn make_index_with_published_split(split_id: &str) -> FileBackedIndex { + let index_metadata = + IndexMetadata::for_test("test-index", "file:///qwdata/indexes/test-index"); + let mut index = FileBackedIndex::new(index_metadata, Vec::new(), HashMap::new(), vec![]); + let split_metadata = SplitMetadata { + split_id: split_id.to_string(), + ..Default::default() + }; + index.stage_split(split_metadata).unwrap(); + index + .publish_splits([split_id], Vec::<&str>::new(), None, None) + .unwrap(); + index + } + + #[test] + fn test_soft_delete_documents_basic() { + let mut index = make_index_with_published_split("split-a"); + let split_doc_ids = vec![SplitDocIds { + split_id: "split-a".to_string(), + doc_ids: vec![1, 5, 42], + }]; + let num_deleted = index.soft_delete_documents(&split_doc_ids).unwrap(); + assert_eq!(num_deleted, 3); + + let split = index.splits.get("split-a").unwrap(); + assert_eq!( + split.split_metadata.soft_deleted_doc_ids, + BTreeSet::from([1, 5, 42]) + ); + } + + #[test] + fn test_soft_delete_documents_idempotent() { + let mut index = make_index_with_published_split("split-a"); + + // First call: delete doc IDs 1, 2, 3. + let split_doc_ids = vec![SplitDocIds { + split_id: "split-a".to_string(), + doc_ids: vec![1, 2, 3], + }]; + let num_deleted = index.soft_delete_documents(&split_doc_ids).unwrap(); + assert_eq!(num_deleted, 3); + + // Second call: same IDs plus one new one. + let split_doc_ids = vec![SplitDocIds { + split_id: "split-a".to_string(), + doc_ids: vec![1, 2, 3, 4], + }]; + let num_deleted = index.soft_delete_documents(&split_doc_ids).unwrap(); + // Only doc_id 4 is new. 
+ assert_eq!(num_deleted, 1); + + let split = index.splits.get("split-a").unwrap(); + assert_eq!( + split.split_metadata.soft_deleted_doc_ids, + BTreeSet::from([1, 2, 3, 4]) + ); + } + + #[test] + fn test_soft_delete_documents_non_published_split_fails() { + let index_metadata = + IndexMetadata::for_test("test-index", "file:///qwdata/indexes/test-index"); + let mut index = FileBackedIndex::new(index_metadata, Vec::new(), HashMap::new(), vec![]); + let split_metadata = SplitMetadata { + split_id: "staged-split".to_string(), + ..Default::default() + }; + index.stage_split(split_metadata).unwrap(); + // The split is still in Staged state — not Published. + + let split_doc_ids = vec![SplitDocIds { + split_id: "staged-split".to_string(), + doc_ids: vec![10], + }]; + let error = index.soft_delete_documents(&split_doc_ids).unwrap_err(); + assert!( + matches!( + error, + MetastoreError::FailedPrecondition { + entity: EntityKind::Split { .. }, + .. + } + ), + "expected FailedPrecondition error, got: {error:?}" + ); + } + + #[test] + fn test_soft_delete_documents_unknown_split_fails() { + let index_metadata = + IndexMetadata::for_test("test-index", "file:///qwdata/indexes/test-index"); + let mut index = FileBackedIndex::new(index_metadata, Vec::new(), HashMap::new(), vec![]); + + let split_doc_ids = vec![SplitDocIds { + split_id: "nonexistent-split".to_string(), + doc_ids: vec![1], + }]; + let error = index.soft_delete_documents(&split_doc_ids).unwrap_err(); + assert!( + matches!(error, MetastoreError::NotFound(EntityKind::Split { .. })), + "expected NotFound error, got: {error:?}" + ); + } + + #[test] + fn test_soft_delete_documents_limit_exceeded() { + let mut index = make_index_with_published_split("split-a"); + + // Pre-populate with MAX_SOFT_DELETED_DOCS_PER_SPLIT - 1 soft-deleted doc IDs. 
+ let initial_ids: Vec = (0..MAX_SOFT_DELETED_DOCS_PER_SPLIT as u32 - 1).collect(); + let initial_entries = vec![SplitDocIds { + split_id: "split-a".to_string(), + doc_ids: initial_ids, + }]; + index.soft_delete_documents(&initial_entries).unwrap(); + + // Adding 2 more unique IDs would push the total to MAX + 1 — must fail. + let overflow_entries = vec![SplitDocIds { + split_id: "split-a".to_string(), + doc_ids: vec![ + MAX_SOFT_DELETED_DOCS_PER_SPLIT as u32 - 1, + MAX_SOFT_DELETED_DOCS_PER_SPLIT as u32, + ], + }]; + let error = index.soft_delete_documents(&overflow_entries).unwrap_err(); + assert!( + matches!( + error, + MetastoreError::FailedPrecondition { + entity: EntityKind::Split { .. }, + .. + } + ), + "expected FailedPrecondition error when limit exceeded, got: {error:?}" + ); + + // The split must be unchanged — still at MAX - 1 entries. + let split = index.splits.get("split-a").unwrap(); + assert_eq!( + split.split_metadata.soft_deleted_doc_ids.len(), + MAX_SOFT_DELETED_DOCS_PER_SPLIT - 1 + ); + } } diff --git a/quickwit/quickwit-metastore/src/metastore/file_backed/manifest.rs b/quickwit/quickwit-metastore/src/metastore/file_backed/manifest.rs index cec811bd3e4..4e9bf42c972 100644 --- a/quickwit/quickwit-metastore/src/metastore/file_backed/manifest.rs +++ b/quickwit/quickwit-metastore/src/metastore/file_backed/manifest.rs @@ -42,6 +42,7 @@ impl LegacyManifest { indexes: self.indexes, templates: HashMap::new(), identity: Uuid::nil(), + kv_store: HashMap::new(), } } } @@ -67,6 +68,7 @@ pub(crate) struct Manifest { // unnecessary here and we can pass the hash map as is to the `MetastoreState` pub templates: HashMap, pub identity: Uuid, + pub kv_store: HashMap, } #[derive(Clone, Debug, Serialize, Deserialize)] @@ -108,6 +110,8 @@ struct ManifestV0_8 { templates: Vec, #[serde(default, skip_serializing_if = "Uuid::is_nil")] identity: Uuid, + #[serde(default, skip_serializing_if = "HashMap::is_empty")] + kv_store: HashMap, } impl From for ManifestV0_8 { @@ 
-121,6 +125,7 @@ impl From for ManifestV0_8 { indexes: manifest.indexes, templates, identity: manifest.identity, + kv_store: manifest.kv_store, } } } @@ -137,6 +142,7 @@ impl From for Manifest { indexes, templates, identity: manifest.identity, + kv_store: manifest.kv_store, } } } @@ -158,12 +164,14 @@ impl quickwit_config::TestableForRegression for Manifest { indexes, templates, identity: Uuid::nil(), + kv_store: HashMap::new(), } } fn assert_equality(&self, other: &Self) { assert_eq!(self.indexes, other.indexes); assert_eq!(self.templates, other.templates); + assert_eq!(self.kv_store, other.kv_store); } } @@ -338,6 +346,7 @@ mod tests { indexes, templates, identity: Uuid::nil(), + kv_store: HashMap::new(), }; let manifest_json = serde_json::to_string_pretty(&manifest).unwrap(); let manifest_deserialized: Manifest = serde_json::from_str(&manifest_json).unwrap(); diff --git a/quickwit/quickwit-metastore/src/metastore/file_backed/mod.rs b/quickwit/quickwit-metastore/src/metastore/file_backed/mod.rs index 2542f1db36f..51dcbc424a8 100644 --- a/quickwit/quickwit-metastore/src/metastore/file_backed/mod.rs +++ b/quickwit/quickwit-metastore/src/metastore/file_backed/mod.rs @@ -42,12 +42,13 @@ use quickwit_config::IndexTemplate; use quickwit_proto::metastore::{ AcquireShardsRequest, AcquireShardsResponse, AddSourceRequest, CreateIndexRequest, CreateIndexResponse, CreateIndexTemplateRequest, DeleteIndexRequest, - DeleteIndexTemplatesRequest, DeleteQuery, DeleteShardsRequest, DeleteShardsResponse, - DeleteSourceRequest, DeleteSplitsRequest, DeleteTask, EmptyResponse, EntityKind, - FindIndexTemplateMatchesRequest, FindIndexTemplateMatchesResponse, GetClusterIdentityRequest, - GetClusterIdentityResponse, GetIndexTemplateRequest, GetIndexTemplateResponse, - IndexMetadataFailure, IndexMetadataFailureReason, IndexMetadataRequest, IndexMetadataResponse, - IndexTemplateMatch, IndexesMetadataRequest, IndexesMetadataResponse, LastDeleteOpstampRequest, + DeleteIndexTemplatesRequest, 
DeleteKvRequest, DeleteQuery, DeleteShardsRequest, + DeleteShardsResponse, DeleteSourceRequest, DeleteSplitsRequest, DeleteTask, EmptyResponse, + EntityKind, FindIndexTemplateMatchesRequest, FindIndexTemplateMatchesResponse, + GetClusterIdentityRequest, GetClusterIdentityResponse, GetIndexTemplateRequest, + GetIndexTemplateResponse, GetKvRequest, GetKvResponse, IndexMetadataFailure, + IndexMetadataFailureReason, IndexMetadataRequest, IndexMetadataResponse, IndexTemplateMatch, + IndexesMetadataRequest, IndexesMetadataResponse, LastDeleteOpstampRequest, LastDeleteOpstampResponse, ListDeleteTasksRequest, ListDeleteTasksResponse, ListIndexStatsRequest, ListIndexStatsResponse, ListIndexTemplatesRequest, ListIndexTemplatesResponse, ListIndexesMetadataRequest, ListIndexesMetadataResponse, @@ -55,8 +56,9 @@ use quickwit_proto::metastore::{ ListStaleSplitsRequest, MarkSplitsForDeletionRequest, MetastoreError, MetastoreResult, MetastoreService, MetastoreServiceStream, OpenShardSubrequest, OpenShardsRequest, OpenShardsResponse, PruneShardsRequest, PublishSplitsRequest, ResetSourceCheckpointRequest, - StageSplitsRequest, ToggleSourceRequest, UpdateIndexRequest, UpdateSourceRequest, - UpdateSplitsDeleteOpstampRequest, UpdateSplitsDeleteOpstampResponse, serde_utils, + SetKvRequest, SoftDeleteDocumentsRequest, SoftDeleteDocumentsResponse, StageSplitsRequest, + ToggleSourceRequest, UpdateIndexRequest, UpdateSourceRequest, UpdateSplitsDeleteOpstampRequest, + UpdateSplitsDeleteOpstampResponse, serde_utils, }; use quickwit_proto::types::{IndexId, IndexUid}; use quickwit_storage::Storage; @@ -729,6 +731,23 @@ impl MetastoreService for FileBackedMetastore { Ok(EmptyResponse {}) } + async fn soft_delete_documents( + &self, + request: SoftDeleteDocumentsRequest, + ) -> MetastoreResult { + let index_uid = request.index_uid().clone(); + let num_soft_deleted_doc_ids = self + .mutate(&index_uid, |index| { + let num_soft_deleted_doc_ids = + 
index.soft_delete_documents(&request.split_doc_ids)?; + Ok(MutationOccurred::Yes(num_soft_deleted_doc_ids)) + }) + .await?; + Ok(SoftDeleteDocumentsResponse { + num_soft_deleted_doc_ids, + }) + } + async fn add_source(&self, request: AddSourceRequest) -> MetastoreResult { let source_config = request.deserialize_source_config()?; let index_uid = request.index_uid(); @@ -1275,6 +1294,43 @@ impl MetastoreService for FileBackedMetastore { uuid: state_wlock_guard.identity.hyphenated().to_string(), }) } + + // KV store API + + async fn get_kv(&self, request: GetKvRequest) -> MetastoreResult { + let state = self.state.read().await; + let value = state.kv_store.get(&request.key).cloned(); + Ok(GetKvResponse { value }) + } + + async fn set_kv(&self, request: SetKvRequest) -> MetastoreResult { + let mut state = self.state.write().await; + let previous_value = state.kv_store.insert(request.key.clone(), request.value); + let manifest = state.as_manifest(); + if let Err(error) = save_manifest(&*self.storage, &manifest).await { + // Rollback + match previous_value { + Some(value) => state.kv_store.insert(request.key, value), + None => state.kv_store.remove(&request.key), + }; + return Err(error); + } + Ok(EmptyResponse {}) + } + + async fn delete_kv(&self, request: DeleteKvRequest) -> MetastoreResult { + let mut state = self.state.write().await; + let previous_value = state.kv_store.remove(&request.key); + let manifest = state.as_manifest(); + if let Err(error) = save_manifest(&*self.storage, &manifest).await { + // Rollback + if let Some(value) = previous_value { + state.kv_store.insert(request.key, value); + } + return Err(error); + } + Ok(EmptyResponse {}) + } } impl MetastoreServiceExt for FileBackedMetastore {} diff --git a/quickwit/quickwit-metastore/src/metastore/file_backed/state.rs b/quickwit/quickwit-metastore/src/metastore/file_backed/state.rs index 0d42408f430..04235e861cf 100644 --- a/quickwit/quickwit-metastore/src/metastore/file_backed/state.rs +++ 
b/quickwit/quickwit-metastore/src/metastore/file_backed/state.rs @@ -33,6 +33,7 @@ pub(super) struct MetastoreState { pub templates: HashMap, pub template_matcher: IndexTemplateMatcher, pub identity: Uuid, + pub kv_store: HashMap, } impl MetastoreState { @@ -67,6 +68,7 @@ impl MetastoreState { templates: manifest.templates, template_matcher, identity: manifest.identity, + kv_store: manifest.kv_store, }; Ok(state) } @@ -89,6 +91,7 @@ impl MetastoreState { indexes, templates, identity: self.identity, + kv_store: self.kv_store.clone(), } } } diff --git a/quickwit/quickwit-metastore/src/metastore/mod.rs b/quickwit/quickwit-metastore/src/metastore/mod.rs index 98f2f1d5039..187ad1676d9 100644 --- a/quickwit/quickwit-metastore/src/metastore/mod.rs +++ b/quickwit/quickwit-metastore/src/metastore/mod.rs @@ -49,6 +49,10 @@ use crate::{Split, SplitMetadata, SplitState}; /// Splits batch size returned by the stream splits API pub(crate) const STREAM_SPLITS_CHUNK_SIZE: usize = 100; +/// Maximum number of soft-deleted document IDs allowed per split. +/// Attempts to soft-delete documents that would push the total above this limit will fail. +pub(crate) const MAX_SOFT_DELETED_DOCS_PER_SPLIT: usize = 10_000; + /// An extended trait for [`MetastoreService`]. #[async_trait] pub trait MetastoreServiceExt: MetastoreService { @@ -640,6 +644,10 @@ pub struct ListSplitsQuery { /// A specific node ID to filter by. pub node_id: Option, + /// A non-empty list of split IDs to fetch, or + /// None to ignore this filter. + pub split_ids: Option>, + /// The maximum number of splits to retrieve. 
pub limit: Option, @@ -739,6 +747,7 @@ impl ListSplitsQuery { mature: Bound::Unbounded, sort_by: SortBy::None, after_split: None, + split_ids: None, } } @@ -765,6 +774,7 @@ impl ListSplitsQuery { mature: Bound::Unbounded, sort_by: SortBy::None, after_split: None, + split_ids: None, }) } @@ -787,6 +797,7 @@ impl ListSplitsQuery { mature: Bound::Unbounded, sort_by: SortBy::None, after_split: None, + split_ids: None, } } @@ -796,6 +807,12 @@ impl ListSplitsQuery { self } + /// Selects only splits with the specified IDs. + pub fn with_split_ids(mut self, split_ids: Vec) -> Self { + self.split_ids = Some(split_ids); + self + } + /// Sets the maximum number of splits to retrieve. pub fn with_limit(mut self, n: usize) -> Self { self.limit = Some(n); diff --git a/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs b/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs index d4296fb7ee6..a2129b8623a 100644 --- a/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs +++ b/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::collections::HashMap; +use std::collections::{BTreeSet, HashMap}; use std::fmt::{self, Write}; use std::str::FromStr; use std::time::Duration; @@ -30,21 +30,23 @@ use quickwit_proto::ingest::{Shard, ShardState}; use quickwit_proto::metastore::{ AcquireShardsRequest, AcquireShardsResponse, AddSourceRequest, CreateIndexRequest, CreateIndexResponse, CreateIndexTemplateRequest, DeleteIndexRequest, - DeleteIndexTemplatesRequest, DeleteQuery, DeleteShardsRequest, DeleteShardsResponse, - DeleteSourceRequest, DeleteSplitsRequest, DeleteTask, EmptyResponse, EntityKind, - FindIndexTemplateMatchesRequest, FindIndexTemplateMatchesResponse, GetClusterIdentityRequest, - GetClusterIdentityResponse, GetIndexTemplateRequest, GetIndexTemplateResponse, - IndexMetadataFailure, IndexMetadataFailureReason, IndexMetadataRequest, IndexMetadataResponse, - IndexStats, IndexTemplateMatch, IndexesMetadataRequest, IndexesMetadataResponse, - LastDeleteOpstampRequest, LastDeleteOpstampResponse, ListDeleteTasksRequest, - ListDeleteTasksResponse, ListIndexStatsRequest, ListIndexStatsResponse, - ListIndexTemplatesRequest, ListIndexTemplatesResponse, ListIndexesMetadataRequest, - ListIndexesMetadataResponse, ListShardsRequest, ListShardsResponse, ListShardsSubresponse, - ListSplitsRequest, ListSplitsResponse, ListStaleSplitsRequest, MarkSplitsForDeletionRequest, - MetastoreError, MetastoreResult, MetastoreService, MetastoreServiceStream, OpenShardSubrequest, + DeleteIndexTemplatesRequest, DeleteKvRequest, DeleteQuery, DeleteShardsRequest, + DeleteShardsResponse, DeleteSourceRequest, DeleteSplitsRequest, DeleteTask, EmptyResponse, + EntityKind, FindIndexTemplateMatchesRequest, FindIndexTemplateMatchesResponse, + GetClusterIdentityRequest, GetClusterIdentityResponse, GetIndexTemplateRequest, + GetIndexTemplateResponse, GetKvRequest, GetKvResponse, IndexMetadataFailure, + IndexMetadataFailureReason, IndexMetadataRequest, IndexMetadataResponse, IndexStats, + IndexTemplateMatch, 
IndexesMetadataRequest, IndexesMetadataResponse, LastDeleteOpstampRequest, + LastDeleteOpstampResponse, ListDeleteTasksRequest, ListDeleteTasksResponse, + ListIndexStatsRequest, ListIndexStatsResponse, ListIndexTemplatesRequest, + ListIndexTemplatesResponse, ListIndexesMetadataRequest, ListIndexesMetadataResponse, + ListShardsRequest, ListShardsResponse, ListShardsSubresponse, ListSplitsRequest, + ListSplitsResponse, ListStaleSplitsRequest, MarkSplitsForDeletionRequest, MetastoreError, + MetastoreResult, MetastoreService, MetastoreServiceStream, OpenShardSubrequest, OpenShardSubresponse, OpenShardsRequest, OpenShardsResponse, PruneShardsRequest, - PublishSplitsRequest, ResetSourceCheckpointRequest, SplitStats, StageSplitsRequest, - ToggleSourceRequest, UpdateIndexRequest, UpdateSourceRequest, UpdateSplitsDeleteOpstampRequest, + PublishSplitsRequest, ResetSourceCheckpointRequest, SetKvRequest, SoftDeleteDocumentsRequest, + SoftDeleteDocumentsResponse, SplitStats, StageSplitsRequest, ToggleSourceRequest, + UpdateIndexRequest, UpdateSourceRequest, UpdateSplitsDeleteOpstampRequest, UpdateSplitsDeleteOpstampResponse, serde_utils, }; use quickwit_proto::types::{IndexId, IndexUid, Position, PublishToken, ShardId, SourceId}; @@ -72,13 +74,14 @@ use crate::file_backed::MutationOccurred; use crate::metastore::postgres::model::Shards; use crate::metastore::postgres::utils::split_maturity_timestamp; use crate::metastore::{ - IndexesMetadataResponseExt, PublishSplitsRequestExt, STREAM_SPLITS_CHUNK_SIZE, - UpdateSourceRequestExt, use_shard_api, + IndexesMetadataResponseExt, MAX_SOFT_DELETED_DOCS_PER_SPLIT, PublishSplitsRequestExt, + STREAM_SPLITS_CHUNK_SIZE, UpdateSourceRequestExt, use_shard_api, }; use crate::{ AddSourceRequestExt, CreateIndexRequestExt, IndexMetadata, IndexMetadataResponseExt, ListIndexesMetadataResponseExt, ListSplitsRequestExt, ListSplitsResponseExt, - MetastoreServiceExt, Split, SplitState, StageSplitsRequestExt, UpdateIndexRequestExt, + 
MetastoreServiceExt, Split, SplitMetadata, SplitState, StageSplitsRequestExt, + UpdateIndexRequestExt, }; /// PostgreSQL metastore implementation. @@ -1165,6 +1168,124 @@ impl MetastoreService for PostgresqlMetastore { Ok(EmptyResponse {}) } + #[instrument(skip(self))] + async fn soft_delete_documents( + &self, + request: SoftDeleteDocumentsRequest, + ) -> MetastoreResult { + let index_uid: IndexUid = request.index_uid().clone(); + let split_doc_ids = request.split_doc_ids; + + if split_doc_ids.is_empty() { + return Ok(SoftDeleteDocumentsResponse { + num_soft_deleted_doc_ids: 0, + }); + } + + // Fetches current metadata for all requested splits in a single round-trip, locking + // the rows for the duration of the transaction. + const FETCH_SPLITS_METADATA_QUERY: &str = r#" + SELECT split_id, split_metadata_json + FROM splits + WHERE + index_uid = $1 + AND split_id = ANY($2) + AND split_state = 'Published' + FOR UPDATE + "#; + + // Updates all modified splits in a single round-trip via UNNEST. + const UPDATE_SPLITS_METADATA_QUERY: &str = r#" + UPDATE splits + SET + split_metadata_json = updates.split_metadata_json, + update_timestamp = (CURRENT_TIMESTAMP AT TIME ZONE 'UTC') + FROM UNNEST($1::TEXT[], $2::TEXT[]) AS updates(split_id, split_metadata_json) + WHERE + splits.index_uid = $3 + AND splits.split_id = updates.split_id + AND splits.split_state = 'Published' + "#; + + // Build a lookup map: split_id → new doc IDs to add. + let mut new_ids_by_split: HashMap<&str, BTreeSet> = HashMap::new(); + for split in &split_doc_ids { + let entry = new_ids_by_split.entry(split.split_id.as_str()).or_default(); + entry.extend(split.doc_ids.iter().copied()); + } + + let requested_split_ids: Vec<&str> = + split_doc_ids.iter().map(|s| s.split_id.as_str()).collect(); + + run_with_tx!(self.connection_pool, tx, "soft delete documents", { + // Phase 1: fetch and lock all relevant splits, merge new doc IDs, validate limits. 
+ // Any error here causes the transaction to roll back, so no split is modified. + let rows: Vec<(String, String)> = sqlx::query_as(FETCH_SPLITS_METADATA_QUERY) + .bind(&index_uid) + .bind(&requested_split_ids) + .fetch_all(tx.as_mut()) + .await + .map_err(|sqlx_error| convert_sqlx_err(&index_uid.index_id, sqlx_error))?; + + let mut updated_split_ids: Vec = Vec::with_capacity(rows.len()); + let mut updated_metadata_jsons: Vec = Vec::with_capacity(rows.len()); + let mut total_soft_deleted: u64 = 0; + + for (split_id, split_metadata_json) in rows { + let new_ids = new_ids_by_split + .get(split_id.as_str()) + .cloned() + .unwrap_or_default(); + + let mut split_metadata = serde_json::from_str::( + &split_metadata_json, + ) + .map_err(|error| MetastoreError::JsonDeserializeError { + struct_name: "SplitMetadata".to_string(), + message: error.to_string(), + })?; + + let old_count = split_metadata.soft_deleted_doc_ids.len(); + split_metadata.soft_deleted_doc_ids.extend(new_ids); + let new_count = split_metadata.soft_deleted_doc_ids.len(); + if old_count == new_count { + continue; + } + + if new_count > MAX_SOFT_DELETED_DOCS_PER_SPLIT { + return Err(MetastoreError::FailedPrecondition { + entity: EntityKind::Split { + split_id: split_id.clone(), + }, + message: format!( + "split `{split_id}` would exceed the maximum number of soft-deleted \ + documents ({MAX_SOFT_DELETED_DOCS_PER_SPLIT}): would be {new_count}", + ), + }); + } + + updated_metadata_jsons.push(serde_utils::to_json_str(&split_metadata)?); + updated_split_ids.push(split_id); + total_soft_deleted += (new_count - old_count) as u64; + } + + // Phase 2: all validations passed — apply all updates in a single query. 
+ if !updated_split_ids.is_empty() { + sqlx::query(UPDATE_SPLITS_METADATA_QUERY) + .bind(&updated_split_ids) + .bind(&updated_metadata_jsons) + .bind(&index_uid) + .execute(tx.as_mut()) + .await + .map_err(|sqlx_error| convert_sqlx_err(&index_uid.index_id, sqlx_error))?; + } + + Ok(SoftDeleteDocumentsResponse { + num_soft_deleted_doc_ids: total_soft_deleted, + }) + }) + } + #[instrument(skip(self))] async fn add_source(&self, request: AddSourceRequest) -> MetastoreResult { let source_config = request.deserialize_source_config()?; @@ -1765,6 +1886,39 @@ impl MetastoreService for PostgresqlMetastore { Ok(EmptyResponse {}) } + async fn get_kv(&self, request: GetKvRequest) -> MetastoreResult { + let value: Option<(String,)> = sqlx::query_as("SELECT value FROM kv WHERE key = $1") + .bind(&request.key) + .fetch_optional(&self.connection_pool) + .await?; + Ok(GetKvResponse { + value: value.map(|(v,)| v), + }) + } + + async fn set_kv(&self, request: SetKvRequest) -> MetastoreResult { + sqlx::query( + r" + INSERT INTO kv (key, value) + VALUES ($1, $2) + ON CONFLICT (key) DO UPDATE SET value = EXCLUDED.value + ", + ) + .bind(&request.key) + .bind(&request.value) + .execute(&self.connection_pool) + .await?; + Ok(EmptyResponse {}) + } + + async fn delete_kv(&self, request: DeleteKvRequest) -> MetastoreResult { + sqlx::query("DELETE FROM kv WHERE key = $1") + .bind(&request.key) + .execute(&self.connection_pool) + .await?; + Ok(EmptyResponse {}) + } + async fn get_cluster_identity( &self, _: GetClusterIdentityRequest, @@ -2241,6 +2395,18 @@ mod tests { sql.to_string(PostgresQueryBuilder), r#"SELECT * FROM "splits" WHERE "time_range_end" <= 42"# ); + + let mut select_statement = Query::select(); + let sql = select_statement.column(Asterisk).from(Splits::Table); + + let query = ListSplitsQuery::for_all_indexes() + .with_split_ids(vec!["split-1".to_string(), "split-2".to_string()]); + append_query_filters_and_order_by(sql, &query); + + assert_eq!( + 
sql.to_string(PostgresQueryBuilder), + r#"SELECT * FROM "splits" WHERE "split_id" IN ('split-1', 'split-2')"# + ); } #[test] diff --git a/quickwit/quickwit-metastore/src/metastore/postgres/model.rs b/quickwit/quickwit-metastore/src/metastore/postgres/model.rs index 86853c531b4..8c605859f55 100644 --- a/quickwit/quickwit-metastore/src/metastore/postgres/model.rs +++ b/quickwit/quickwit-metastore/src/metastore/postgres/model.rs @@ -90,6 +90,7 @@ pub enum Splits { IndexUid, NodeId, DeleteOpstamp, + SoftDeletedDocIds, } pub(super) struct ToTimestampFunc; diff --git a/quickwit/quickwit-metastore/src/metastore/postgres/utils.rs b/quickwit/quickwit-metastore/src/metastore/postgres/utils.rs index b5769201948..f0d87246c0c 100644 --- a/quickwit/quickwit-metastore/src/metastore/postgres/utils.rs +++ b/quickwit/quickwit-metastore/src/metastore/postgres/utils.rs @@ -107,6 +107,10 @@ pub(super) fn append_query_filters_and_order_by( sql.cond_where(Expr::col(Splits::IndexUid).is_in(index_uids)); } + if let Some(split_ids) = &query.split_ids { + sql.cond_where(Expr::col(Splits::SplitId).is_in(split_ids)); + } + if let Some(node_id) = &query.node_id { sql.cond_where(Expr::col(Splits::NodeId).eq(node_id)); }; diff --git a/quickwit/quickwit-metastore/src/split_metadata.rs b/quickwit/quickwit-metastore/src/split_metadata.rs index 829029e5d43..3de6f9122f4 100644 --- a/quickwit/quickwit-metastore/src/split_metadata.rs +++ b/quickwit/quickwit-metastore/src/split_metadata.rs @@ -135,6 +135,9 @@ pub struct SplitMetadata { /// Doc mapping UID used when creating this split. This split may only be merged with other /// splits using the same doc mapping UID. pub doc_mapping_uid: DocMappingUid, + + /// Set of tantivy doc_ids that have been soft-deleted from this split. 
+ pub soft_deleted_doc_ids: BTreeSet, } impl fmt::Debug for SplitMetadata { @@ -180,6 +183,9 @@ impl fmt::Debug for SplitMetadata { debug_struct.field("footer_offsets", &self.footer_offsets); debug_struct.field("delete_opstamp", &self.delete_opstamp); debug_struct.field("num_merge_ops", &self.num_merge_ops); + if !self.soft_deleted_doc_ids.is_empty() { + debug_struct.field("soft_deleted_doc_ids", &self.soft_deleted_doc_ids); + } debug_struct.finish() } } @@ -286,6 +292,7 @@ impl quickwit_config::TestableForRegression for SplitMetadata { footer_offsets: 1000..2000, num_merge_ops: 3, doc_mapping_uid: DocMappingUid::default(), + soft_deleted_doc_ids: BTreeSet::new(), } } @@ -427,6 +434,7 @@ mod tests { delete_opstamp: 0, num_merge_ops: 0, doc_mapping_uid: DocMappingUid::default(), + soft_deleted_doc_ids: BTreeSet::new(), }; let expected_output = diff --git a/quickwit/quickwit-metastore/src/split_metadata_version.rs b/quickwit/quickwit-metastore/src/split_metadata_version.rs index 5f6204c85b7..43b38542133 100644 --- a/quickwit/quickwit-metastore/src/split_metadata_version.rs +++ b/quickwit/quickwit-metastore/src/split_metadata_version.rs @@ -97,6 +97,10 @@ pub(crate) struct SplitMetadataV0_8 { // splits before when updates first appeared are compatible with each other. #[serde(default)] doc_mapping_uid: DocMappingUid, + + /// Set of tantivy doc_ids that have been soft-deleted from this split. 
+ #[serde(default)] + pub soft_deleted_doc_ids: BTreeSet, } impl From for SplitMetadata { @@ -134,6 +138,7 @@ impl From for SplitMetadata { footer_offsets: v8.footer_offsets, num_merge_ops: v8.num_merge_ops, doc_mapping_uid: v8.doc_mapping_uid, + soft_deleted_doc_ids: v8.soft_deleted_doc_ids, } } } @@ -157,6 +162,7 @@ impl From for SplitMetadataV0_8 { footer_offsets: split.footer_offsets, num_merge_ops: split.num_merge_ops, doc_mapping_uid: split.doc_mapping_uid, + soft_deleted_doc_ids: split.soft_deleted_doc_ids, } } } diff --git a/quickwit/quickwit-metastore/src/tests/kv.rs b/quickwit/quickwit-metastore/src/tests/kv.rs new file mode 100644 index 00000000000..32f0834b6f6 --- /dev/null +++ b/quickwit/quickwit-metastore/src/tests/kv.rs @@ -0,0 +1,232 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use quickwit_proto::metastore::{DeleteKvRequest, GetKvRequest, MetastoreService, SetKvRequest}; + +use super::DefaultForTest; + +pub async fn test_metastore_kv_set_get() { + let metastore = MetastoreToTest::default_for_test().await; + + // Set a key-value pair + let set_request = SetKvRequest { + key: "test-key".to_string(), + value: "test-value".to_string(), + }; + metastore.set_kv(set_request).await.unwrap(); + + // Get the key-value pair + let get_request = GetKvRequest { + key: "test-key".to_string(), + }; + let response = metastore.get_kv(get_request).await.unwrap(); + assert_eq!(response.value, Some("test-value".to_string())); +} + +pub async fn test_metastore_kv_get_non_existent< + MetastoreToTest: MetastoreService + DefaultForTest, +>() { + let metastore = MetastoreToTest::default_for_test().await; + + // Try to get a non-existent key + let get_request = GetKvRequest { + key: "non-existent-key".to_string(), + }; + let response = metastore.get_kv(get_request).await.unwrap(); + assert_eq!(response.value, None); +} + +pub async fn test_metastore_kv_set_overwrite() { + let metastore = MetastoreToTest::default_for_test().await; + + // Set a key-value pair + let set_request = SetKvRequest { + key: "test-key".to_string(), + value: "original-value".to_string(), + }; + metastore.set_kv(set_request).await.unwrap(); + + // Overwrite with new value + let set_request = SetKvRequest { + key: "test-key".to_string(), + value: "updated-value".to_string(), + }; + metastore.set_kv(set_request).await.unwrap(); + + // Verify the value was updated + let get_request = GetKvRequest { + key: "test-key".to_string(), + }; + let response = metastore.get_kv(get_request).await.unwrap(); + assert_eq!(response.value, Some("updated-value".to_string())); +} + +pub async fn test_metastore_kv_delete() { + let metastore = MetastoreToTest::default_for_test().await; + + // Set a key-value pair + let set_request = SetKvRequest { + key: "test-key".to_string(), + value: "test-value".to_string(), 
+ }; + metastore.set_kv(set_request).await.unwrap(); + + // Verify it exists + let get_request = GetKvRequest { + key: "test-key".to_string(), + }; + let response = metastore.get_kv(get_request).await.unwrap(); + assert_eq!(response.value, Some("test-value".to_string())); + + // Delete the key + let delete_request = DeleteKvRequest { + key: "test-key".to_string(), + }; + metastore.delete_kv(delete_request).await.unwrap(); + + // Verify it no longer exists + let get_request = GetKvRequest { + key: "test-key".to_string(), + }; + let response = metastore.get_kv(get_request).await.unwrap(); + assert_eq!(response.value, None); +} + +pub async fn test_metastore_kv_delete_non_existent< + MetastoreToTest: MetastoreService + DefaultForTest, +>() { + let metastore = MetastoreToTest::default_for_test().await; + + // Delete a non-existent key (should succeed without error) + let delete_request = DeleteKvRequest { + key: "non-existent-key".to_string(), + }; + metastore.delete_kv(delete_request).await.unwrap(); +} + +pub async fn test_metastore_kv_multiple_keys() { + let metastore = MetastoreToTest::default_for_test().await; + + // Set multiple key-value pairs + let set_request_1 = SetKvRequest { + key: "key-1".to_string(), + value: "value-1".to_string(), + }; + metastore.set_kv(set_request_1).await.unwrap(); + + let set_request_2 = SetKvRequest { + key: "key-2".to_string(), + value: "value-2".to_string(), + }; + metastore.set_kv(set_request_2).await.unwrap(); + + let set_request_3 = SetKvRequest { + key: "key-3".to_string(), + value: "value-3".to_string(), + }; + metastore.set_kv(set_request_3).await.unwrap(); + + // Verify all keys exist + let get_request_1 = GetKvRequest { + key: "key-1".to_string(), + }; + let response_1 = metastore.get_kv(get_request_1).await.unwrap(); + assert_eq!(response_1.value, Some("value-1".to_string())); + + let get_request_2 = GetKvRequest { + key: "key-2".to_string(), + }; + let response_2 = metastore.get_kv(get_request_2).await.unwrap(); + 
assert_eq!(response_2.value, Some("value-2".to_string())); + + let get_request_3 = GetKvRequest { + key: "key-3".to_string(), + }; + let response_3 = metastore.get_kv(get_request_3).await.unwrap(); + assert_eq!(response_3.value, Some("value-3".to_string())); + + // Delete one key + let delete_request = DeleteKvRequest { + key: "key-2".to_string(), + }; + metastore.delete_kv(delete_request).await.unwrap(); + + // Verify key-2 is deleted but others remain + let get_request_1 = GetKvRequest { + key: "key-1".to_string(), + }; + let response_1 = metastore.get_kv(get_request_1).await.unwrap(); + assert_eq!(response_1.value, Some("value-1".to_string())); + + let get_request_2 = GetKvRequest { + key: "key-2".to_string(), + }; + let response_2 = metastore.get_kv(get_request_2).await.unwrap(); + assert_eq!(response_2.value, None); + + let get_request_3 = GetKvRequest { + key: "key-3".to_string(), + }; + let response_3 = metastore.get_kv(get_request_3).await.unwrap(); + assert_eq!(response_3.value, Some("value-3".to_string())); +} + +pub async fn test_metastore_kv_empty_key() { + let metastore = MetastoreToTest::default_for_test().await; + + // Set a key-value pair with an empty key + let set_request = SetKvRequest { + key: "".to_string(), + value: "empty-key-value".to_string(), + }; + metastore.set_kv(set_request).await.unwrap(); + + // Get the empty key + let get_request = GetKvRequest { + key: "".to_string(), + }; + let response = metastore.get_kv(get_request).await.unwrap(); + assert_eq!(response.value, Some("empty-key-value".to_string())); + + // Delete the empty key + let delete_request = DeleteKvRequest { + key: "".to_string(), + }; + metastore.delete_kv(delete_request).await.unwrap(); + + // Verify it's deleted + let get_request = GetKvRequest { + key: "".to_string(), + }; + let response = metastore.get_kv(get_request).await.unwrap(); + assert_eq!(response.value, None); +} + +pub async fn test_metastore_kv_empty_value() { + let metastore = 
MetastoreToTest::default_for_test().await; + + // Set a key-value pair with an empty value + let set_request = SetKvRequest { + key: "test-key".to_string(), + value: String::new(), + }; + metastore.set_kv(set_request).await.unwrap(); + + // Get the key with empty value + let get_request = GetKvRequest { + key: "test-key".to_string(), + }; + let response = metastore.get_kv(get_request).await.unwrap(); + assert_eq!(response.value, Some(String::new())); +} diff --git a/quickwit/quickwit-metastore/src/tests/mod.rs b/quickwit/quickwit-metastore/src/tests/mod.rs index d6e549baf25..c6177da1b43 100644 --- a/quickwit/quickwit-metastore/src/tests/mod.rs +++ b/quickwit/quickwit-metastore/src/tests/mod.rs @@ -28,6 +28,7 @@ use quickwit_proto::types::IndexUid; pub(crate) mod delete_task; pub(crate) mod get_identity; pub(crate) mod index; +pub(crate) mod kv; pub(crate) mod list_splits; pub(crate) mod shard; pub(crate) mod source; @@ -575,6 +576,94 @@ macro_rules! metastore_test_suite { let _ = tracing_subscriber::fmt::try_init(); $crate::tests::get_identity::test_metastore_get_identity::<$metastore_type>().await; } + + /// KV API tests + + #[tokio::test] + #[serial_test::file_serial] + async fn test_metastore_kv_set_get() { + let _ = tracing_subscriber::fmt::try_init(); + $crate::tests::kv::test_metastore_kv_set_get::<$metastore_type>().await; + } + + #[tokio::test] + #[serial_test::file_serial] + async fn test_metastore_kv_get_non_existent() { + let _ = tracing_subscriber::fmt::try_init(); + $crate::tests::kv::test_metastore_kv_get_non_existent::<$metastore_type>().await; + } + + #[tokio::test] + #[serial_test::file_serial] + async fn test_metastore_kv_set_overwrite() { + let _ = tracing_subscriber::fmt::try_init(); + $crate::tests::kv::test_metastore_kv_set_overwrite::<$metastore_type>().await; + } + + #[tokio::test] + #[serial_test::file_serial] + async fn test_metastore_kv_delete() { + let _ = tracing_subscriber::fmt::try_init(); + 
$crate::tests::kv::test_metastore_kv_delete::<$metastore_type>().await; + } + + #[tokio::test] + #[serial_test::file_serial] + async fn test_metastore_kv_delete_non_existent() { + let _ = tracing_subscriber::fmt::try_init(); + $crate::tests::kv::test_metastore_kv_delete_non_existent::<$metastore_type>().await; + } + + #[tokio::test] + #[serial_test::file_serial] + async fn test_metastore_kv_multiple_keys() { + let _ = tracing_subscriber::fmt::try_init(); + $crate::tests::kv::test_metastore_kv_multiple_keys::<$metastore_type>().await; + } + + #[tokio::test] + #[serial_test::file_serial] + async fn test_metastore_kv_empty_key() { + let _ = tracing_subscriber::fmt::try_init(); + $crate::tests::kv::test_metastore_kv_empty_key::<$metastore_type>().await; + } + + #[tokio::test] + #[serial_test::file_serial] + async fn test_metastore_kv_empty_value() { + let _ = tracing_subscriber::fmt::try_init(); + $crate::tests::kv::test_metastore_kv_empty_value::<$metastore_type>().await; + } + + /// Soft-delete documents API tests + + #[tokio::test] + #[serial_test::file_serial] + async fn test_metastore_soft_delete_documents() { + let _ = tracing_subscriber::fmt::try_init(); + $crate::tests::split::test_metastore_soft_delete_documents::<$metastore_type>().await; + } + + #[tokio::test] + #[serial_test::file_serial] + async fn test_metastore_soft_delete_documents_idempotent() { + let _ = tracing_subscriber::fmt::try_init(); + $crate::tests::split::test_metastore_soft_delete_documents_idempotent::<$metastore_type>().await; + } + + #[tokio::test] + #[serial_test::file_serial] + async fn test_metastore_soft_delete_documents_non_published_split() { + let _ = tracing_subscriber::fmt::try_init(); + $crate::tests::split::test_metastore_soft_delete_documents_non_published_split::<$metastore_type>().await; + } + + #[tokio::test] + #[serial_test::file_serial] + async fn test_metastore_soft_delete_documents_limit_exceeded() { + let _ = tracing_subscriber::fmt::try_init(); + 
$crate::tests::split::test_metastore_soft_delete_documents_limit_exceeded::<$metastore_type>().await; + } } }; } diff --git a/quickwit/quickwit-metastore/src/tests/split.rs b/quickwit/quickwit-metastore/src/tests/split.rs index 9e6d45265e3..16a96905936 100644 --- a/quickwit/quickwit-metastore/src/tests/split.rs +++ b/quickwit/quickwit-metastore/src/tests/split.rs @@ -20,7 +20,7 @@ use quickwit_config::{IndexConfig, SourceConfig, SourceParams}; use quickwit_proto::metastore::{ CreateIndexRequest, DeleteSplitsRequest, EntityKind, IndexMetadataRequest, ListSplitsRequest, ListStaleSplitsRequest, MarkSplitsForDeletionRequest, MetastoreError, PublishSplitsRequest, - StageSplitsRequest, UpdateSplitsDeleteOpstampRequest, + SoftDeleteDocumentsRequest, SplitDocIds, StageSplitsRequest, UpdateSplitsDeleteOpstampRequest, }; use quickwit_proto::types::{IndexUid, Position}; use time::OffsetDateTime; @@ -29,7 +29,7 @@ use tracing::{error, info}; use super::DefaultForTest; use crate::checkpoint::{IndexCheckpointDelta, PartitionId, SourceCheckpointDelta}; -use crate::metastore::MetastoreServiceStreamSplitsExt; +use crate::metastore::{MAX_SOFT_DELETED_DOCS_PER_SPLIT, MetastoreServiceStreamSplitsExt}; use crate::tests::cleanup_index; use crate::{ CreateIndexRequestExt, IndexMetadataResponseExt, ListSplitsQuery, ListSplitsRequestExt, @@ -1806,3 +1806,431 @@ pub async fn test_metastore_update_splits_delete_opstamp< cleanup_index(&mut metastore, index_uid).await; } } + +pub async fn test_metastore_soft_delete_documents< + MetastoreToTest: MetastoreServiceExt + DefaultForTest, +>() { + let mut metastore = MetastoreToTest::default_for_test().await; + + let index_id = append_random_suffix("test-soft-delete-docs"); + let index_uri = format!("ram:///indexes/{index_id}"); + let index_config = IndexConfig::for_test(&index_id, &index_uri); + + let create_index_request = CreateIndexRequest::try_from_index_config(&index_config).unwrap(); + let index_uid: IndexUid = metastore + 
.create_index(create_index_request) + .await + .unwrap() + .index_uid() + .clone(); + + let split_id = format!("{index_id}--split-1"); + let split_metadata = SplitMetadata { + split_id: split_id.clone(), + index_uid: index_uid.clone(), + ..Default::default() + }; + + let stage_splits_request = + StageSplitsRequest::try_from_split_metadata(index_uid.clone(), &split_metadata).unwrap(); + metastore.stage_splits(stage_splits_request).await.unwrap(); + + let publish_splits_request = PublishSplitsRequest { + index_uid: Some(index_uid.clone()), + staged_split_ids: vec![split_id.clone()], + ..Default::default() + }; + metastore + .publish_splits(publish_splits_request) + .await + .unwrap(); + + let soft_delete_request = SoftDeleteDocumentsRequest { + index_uid: Some(index_uid.clone()), + split_doc_ids: vec![SplitDocIds { + split_id: split_id.clone(), + doc_ids: vec![1, 5, 42], + }], + }; + let response = metastore + .soft_delete_documents(soft_delete_request) + .await + .unwrap(); + assert!(response.num_soft_deleted_doc_ids > 0); + + let splits = metastore + .list_splits(ListSplitsRequest::try_from_index_uid(index_uid.clone()).unwrap()) + .await + .unwrap() + .collect_splits() + .await + .unwrap(); + + assert_eq!(splits.len(), 1); + let soft_deleted = &splits[0].split_metadata.soft_deleted_doc_ids; + assert!(soft_deleted.contains(&1)); + assert!(soft_deleted.contains(&5)); + assert!(soft_deleted.contains(&42)); + assert_eq!(soft_deleted.len(), 3); + + cleanup_index(&mut metastore, index_uid).await; +} + +pub async fn test_metastore_soft_delete_documents_idempotent< + MetastoreToTest: MetastoreServiceExt + DefaultForTest, +>() { + let mut metastore = MetastoreToTest::default_for_test().await; + + let index_id = append_random_suffix("test-soft-delete-idempotent"); + let index_uri = format!("ram:///indexes/{index_id}"); + let index_config = IndexConfig::for_test(&index_id, &index_uri); + + let create_index_request = 
CreateIndexRequest::try_from_index_config(&index_config).unwrap(); + let index_uid: IndexUid = metastore + .create_index(create_index_request) + .await + .unwrap() + .index_uid() + .clone(); + + let split_id = format!("{index_id}--split-1"); + let split_metadata = SplitMetadata { + split_id: split_id.clone(), + index_uid: index_uid.clone(), + ..Default::default() + }; + + let stage_splits_request = + StageSplitsRequest::try_from_split_metadata(index_uid.clone(), &split_metadata).unwrap(); + metastore.stage_splits(stage_splits_request).await.unwrap(); + + let publish_splits_request = PublishSplitsRequest { + index_uid: Some(index_uid.clone()), + staged_split_ids: vec![split_id.clone()], + ..Default::default() + }; + metastore + .publish_splits(publish_splits_request) + .await + .unwrap(); + + // First call: soft-delete doc IDs [1, 2, 3]. + let soft_delete_request = SoftDeleteDocumentsRequest { + index_uid: Some(index_uid.clone()), + split_doc_ids: vec![SplitDocIds { + split_id: split_id.clone(), + doc_ids: vec![1, 2, 3], + }], + }; + metastore + .soft_delete_documents(soft_delete_request) + .await + .unwrap(); + + // Second call: same doc IDs — must not return an error. + let soft_delete_request = SoftDeleteDocumentsRequest { + index_uid: Some(index_uid.clone()), + split_doc_ids: vec![SplitDocIds { + split_id: split_id.clone(), + doc_ids: vec![1, 2, 3], + }], + }; + metastore + .soft_delete_documents(soft_delete_request) + .await + .unwrap(); + + // The set of soft-deleted IDs must still be exactly {1, 2, 3}. 
+ let splits = metastore + .list_splits(ListSplitsRequest::try_from_index_uid(index_uid.clone()).unwrap()) + .await + .unwrap() + .collect_splits() + .await + .unwrap(); + + assert_eq!(splits.len(), 1); + let soft_deleted = &splits[0].split_metadata.soft_deleted_doc_ids; + assert_eq!(soft_deleted.len(), 3); + assert!(soft_deleted.contains(&1)); + assert!(soft_deleted.contains(&2)); + assert!(soft_deleted.contains(&3)); + + // Third call: same IDs plus one new one — must extend the set by exactly one. + let soft_delete_request = SoftDeleteDocumentsRequest { + index_uid: Some(index_uid.clone()), + split_doc_ids: vec![SplitDocIds { + split_id: split_id.clone(), + doc_ids: vec![1, 2, 3, 4], + }], + }; + metastore + .soft_delete_documents(soft_delete_request) + .await + .unwrap(); + + let splits = metastore + .list_splits(ListSplitsRequest::try_from_index_uid(index_uid.clone()).unwrap()) + .await + .unwrap() + .collect_splits() + .await + .unwrap(); + + assert_eq!(splits.len(), 1); + let soft_deleted = &splits[0].split_metadata.soft_deleted_doc_ids; + assert_eq!(soft_deleted.len(), 4); + assert!(soft_deleted.contains(&1)); + assert!(soft_deleted.contains(&2)); + assert!(soft_deleted.contains(&3)); + assert!(soft_deleted.contains(&4)); + + cleanup_index(&mut metastore, index_uid).await; +} + +pub async fn test_metastore_soft_delete_documents_non_published_split< + MetastoreToTest: MetastoreServiceExt + DefaultForTest, +>() { + let mut metastore = MetastoreToTest::default_for_test().await; + + let index_id = append_random_suffix("test-soft-delete-unpublished"); + let index_uri = format!("ram:///indexes/{index_id}"); + let index_config = IndexConfig::for_test(&index_id, &index_uri); + + let create_index_request = CreateIndexRequest::try_from_index_config(&index_config).unwrap(); + let index_uid: IndexUid = metastore + .create_index(create_index_request) + .await + .unwrap() + .index_uid() + .clone(); + + // Stage a split but do NOT publish it. 
+ let staged_split_id = format!("{index_id}--split1"); + let staged_split_metadata = SplitMetadata { + split_id: staged_split_id.clone(), + index_uid: index_uid.clone(), + ..Default::default() + }; + let stage_splits_request = + StageSplitsRequest::try_from_split_metadata(index_uid.clone(), &staged_split_metadata) + .unwrap(); + metastore.stage_splits(stage_splits_request).await.unwrap(); + + // Stage, publish, then mark another split for deletion. + let marked_split_id = format!("{index_id}--split2"); + let marked_split_metadata = SplitMetadata { + split_id: marked_split_id.clone(), + index_uid: index_uid.clone(), + ..Default::default() + }; + let stage_splits_request = + StageSplitsRequest::try_from_split_metadata(index_uid.clone(), &marked_split_metadata) + .unwrap(); + metastore.stage_splits(stage_splits_request).await.unwrap(); + + let publish_splits_request = PublishSplitsRequest { + index_uid: Some(index_uid.clone()), + staged_split_ids: vec![marked_split_id.clone()], + ..Default::default() + }; + metastore + .publish_splits(publish_splits_request) + .await + .unwrap(); + + let mark_for_deletion_request = + MarkSplitsForDeletionRequest::new(index_uid.clone(), vec![marked_split_id.clone()]); + metastore + .mark_splits_for_deletion(mark_for_deletion_request) + .await + .unwrap(); + + // Attempt to soft-delete documents on the staged split. + // Implementations may return an error (file-backed) or silently skip (postgres) — both are + // valid. What matters is that the split's soft_deleted_doc_ids remains unmodified. 
+ let soft_delete_request = SoftDeleteDocumentsRequest { + index_uid: Some(index_uid.clone()), + split_doc_ids: vec![SplitDocIds { + split_id: staged_split_id.clone(), + doc_ids: vec![10, 20], + }], + }; + let _ = metastore.soft_delete_documents(soft_delete_request).await; + + let list_staged_request = ListSplitsRequest::try_from_list_splits_query( + &ListSplitsQuery::for_index(index_uid.clone()).with_split_state(SplitState::Staged), + ) + .unwrap(); + let staged_splits = metastore + .list_splits(list_staged_request) + .await + .unwrap() + .collect_splits() + .await + .unwrap(); + + assert_eq!(staged_splits.len(), 1); + assert!( + staged_splits[0] + .split_metadata + .soft_deleted_doc_ids + .is_empty(), + "staged split must not have any soft-deleted doc IDs" + ); + + // Attempt to soft-delete documents on the marked-for-deletion split. + let soft_delete_request = SoftDeleteDocumentsRequest { + index_uid: Some(index_uid.clone()), + split_doc_ids: vec![SplitDocIds { + split_id: marked_split_id.clone(), + doc_ids: vec![30, 40], + }], + }; + let _ = metastore.soft_delete_documents(soft_delete_request).await; + + let list_marked_request = ListSplitsRequest::try_from_list_splits_query( + &ListSplitsQuery::for_index(index_uid.clone()) + .with_split_state(SplitState::MarkedForDeletion), + ) + .unwrap(); + let marked_splits = metastore + .list_splits(list_marked_request) + .await + .unwrap() + .collect_splits() + .await + .unwrap(); + + assert_eq!(marked_splits.len(), 1); + assert!( + marked_splits[0] + .split_metadata + .soft_deleted_doc_ids + .is_empty(), + "marked-for-deletion split must not have any soft-deleted doc IDs" + ); + + cleanup_index(&mut metastore, index_uid).await; +} + +pub async fn test_metastore_soft_delete_documents_limit_exceeded< + MetastoreToTest: MetastoreServiceExt + DefaultForTest, +>() { + let mut metastore = MetastoreToTest::default_for_test().await; + + let index_id = append_random_suffix("test-soft-delete-limit"); + let index_uri = 
format!("ram:///indexes/{index_id}"); + let index_config = IndexConfig::for_test(&index_id, &index_uri); + + let create_index_request = CreateIndexRequest::try_from_index_config(&index_config).unwrap(); + let index_uid: IndexUid = metastore + .create_index(create_index_request) + .await + .unwrap() + .index_uid() + .clone(); + + // Create and publish two splits. + let split_a_id = format!("{index_id}--split-a"); + let split_b_id = format!("{index_id}--split-b"); + + for split_id in [&split_a_id, &split_b_id] { + let split_metadata = SplitMetadata { + split_id: split_id.clone(), + index_uid: index_uid.clone(), + ..Default::default() + }; + let stage_request = + StageSplitsRequest::try_from_split_metadata(index_uid.clone(), &split_metadata) + .unwrap(); + metastore.stage_splits(stage_request).await.unwrap(); + + let publish_request = PublishSplitsRequest { + index_uid: Some(index_uid.clone()), + staged_split_ids: vec![split_id.clone()], + ..Default::default() + }; + metastore.publish_splits(publish_request).await.unwrap(); + } + + // Pre-populate split-b with MAX - 1 soft-deleted doc IDs so one more would be fine but two + // would exceed the limit. + let initial_ids: Vec = (0..MAX_SOFT_DELETED_DOCS_PER_SPLIT as u32 - 1).collect(); + let pre_populate_request = SoftDeleteDocumentsRequest { + index_uid: Some(index_uid.clone()), + split_doc_ids: vec![SplitDocIds { + split_id: split_b_id.clone(), + doc_ids: initial_ids, + }], + }; + metastore + .soft_delete_documents(pre_populate_request) + .await + .unwrap(); + + // Request that would: + // - soft-delete 1 doc on split-a (valid on its own) + // - soft-delete 2 *new* docs on split-b (would push total from MAX-1 to MAX+1) + // The whole request must fail and neither split must be modified. 
+ let overflow_request = SoftDeleteDocumentsRequest { + index_uid: Some(index_uid.clone()), + split_doc_ids: vec![ + SplitDocIds { + split_id: split_a_id.clone(), + doc_ids: vec![100], + }, + SplitDocIds { + split_id: split_b_id.clone(), + doc_ids: vec![ + MAX_SOFT_DELETED_DOCS_PER_SPLIT as u32 - 1, + MAX_SOFT_DELETED_DOCS_PER_SPLIT as u32, + ], + }, + ], + }; + let error = metastore + .soft_delete_documents(overflow_request) + .await + .unwrap_err(); + assert!( + matches!( + error, + MetastoreError::FailedPrecondition { + entity: EntityKind::Split { .. }, + .. + } + ), + "expected FailedPrecondition when soft-deleted doc limit is exceeded, got: {error:?}" + ); + + // Verify atomicity: both splits must be unmodified after the failed request. + let splits = metastore + .list_splits(ListSplitsRequest::try_from_index_uid(index_uid.clone()).unwrap()) + .await + .unwrap() + .collect_splits() + .await + .unwrap(); + + let split_a = splits + .iter() + .find(|s| s.split_metadata.split_id == split_a_id) + .expect("split-a must exist"); + assert!( + split_a.split_metadata.soft_deleted_doc_ids.is_empty(), + "split-a must not have been modified (atomicity guarantee)" + ); + + let split_b = splits + .iter() + .find(|s| s.split_metadata.split_id == split_b_id) + .expect("split-b must exist"); + assert_eq!( + split_b.split_metadata.soft_deleted_doc_ids.len(), + MAX_SOFT_DELETED_DOCS_PER_SPLIT - 1, + "split-b must not have been modified (atomicity guarantee)" + ); + + cleanup_index(&mut metastore, index_uid).await; +} diff --git a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.7.expected.json b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.7.expected.json index 9f7f0e27f23..0f708ac16b3 100644 --- a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.7.expected.json +++ b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.7.expected.json @@ -1,200 +1,201 @@ { - "version": "0.9", + "delete_tasks": [ + { + "create_timestamp": 0, + 
"delete_query": { + "index_uid": "my-index:00000000000000000000000000", + "query_ast": "{\"type\":\"bool\",\"must\":[{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Harry\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}}},{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Potter\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}}}]}" + }, + "opstamp": 10 + } + ], "index": { - "version": "0.9", - "index_uid": "my-index:00000000000000000000000000", + "checkpoint": { + "kafka-source": { + "00000000000000000000": "00000000000000000042" + } + }, + "create_timestamp": 1789, "index_config": { - "version": "0.9", - "index_id": "my-index", - "index_uri": "s3://quickwit-indexes/my-index", "doc_mapping": { "doc_mapping_uid": "00000000000000000000000000", - "mode": "dynamic", "dynamic_mapping": { - "indexed": true, - "tokenizer": "raw", - "record": "basic", - "stored": true, "expand_dots": true, "fast": { "normalizer": "raw" - } + }, + "indexed": true, + "record": "basic", + "stored": true, + "tokenizer": "raw" }, "field_mappings": [ { + "coerce": true, + "fast": true, + "indexed": true, "name": "tenant_id", - "type": "u64", + "output_format": "number", "stored": true, - "indexed": true, - "fast": true, - "coerce": true, - "output_format": "number" + "type": "u64" }, { - "name": "timestamp", - "type": "datetime", + "fast": true, + "fast_precision": "seconds", + "indexed": true, "input_formats": [ "rfc3339", "unix_timestamp" ], + "name": "timestamp", "output_format": "rfc3339", - "fast_precision": "seconds", - "indexed": true, "stored": true, - "fast": true + "type": "datetime" }, { - "name": "log_level", - "type": "text", + "fast": false, + "fieldnorms": false, "indexed": true, - "tokenizer": "raw", + "name": "log_level", "record": "basic", - "fieldnorms": false, "stored": true, - "fast": false + "tokenizer": "raw", + "type": "text" }, { - "name": "message", - "type": "text", + "fast": false, + "fieldnorms": false, "indexed": 
true, - "tokenizer": "default", + "name": "message", "record": "position", - "fieldnorms": false, "stored": true, - "fast": false + "tokenizer": "default", + "type": "text" } ], - "timestamp_field": "timestamp", + "index_field_presence": true, + "max_num_partitions": 100, + "mode": "dynamic", + "partition_key": "tenant_id", + "store_document_size": false, + "store_source": true, "tag_fields": [ "log_level", "tenant_id" ], - "partition_key": "tenant_id", - "max_num_partitions": 100, - "index_field_presence": true, - "store_document_size": false, - "store_source": true, + "timestamp_field": "timestamp", "tokenizers": [ { + "filters": [], "name": "custom_tokenizer", - "type": "regex", "pattern": "[^\\p{L}\\p{N}]+", - "filters": [] + "type": "regex" } ] }, + "index_id": "my-index", + "index_uri": "s3://quickwit-indexes/my-index", "indexing_settings": { "commit_timeout_secs": 301, - "docstore_compression_level": 8, "docstore_blocksize": 1000000, - "split_num_docs_target": 10000001, + "docstore_compression_level": 8, "merge_policy": { - "type": "stable_log", - "min_level_num_docs": 100000, - "merge_factor": 9, + "maturation_period": "2days", "max_merge_factor": 11, - "maturation_period": "2days" + "merge_factor": 9, + "min_level_num_docs": 100000, + "type": "stable_log" }, "resources": { "heap_size": "50.0 MB" - } + }, + "split_num_docs_target": 10000001 }, "ingest_settings": { "min_shards": 1 }, + "retention": { + "period": "90 days", + "schedule": "daily" + }, "search_settings": { "default_search_fields": [ "message" ] }, - "retention": { - "period": "90 days", - "schedule": "daily" - } - }, - "checkpoint": { - "kafka-source": { - "00000000000000000000": "00000000000000000042" - } + "version": "0.9" }, - "create_timestamp": 1789, + "index_uid": "my-index:00000000000000000000000000", "sources": [ { - "version": "0.9", - "source_id": "kafka-source", - "num_pipelines": 2, "enabled": true, - "source_type": "kafka", + "input_format": "json", + "num_pipelines": 2, "params": 
{ - "topic": "kafka-topic", - "client_params": {} + "client_params": {}, + "topic": "kafka-topic" }, + "source_id": "kafka-source", + "source_type": "kafka", "transform": { "script": ".message = downcase(string!(.message))", "timezone": "UTC" }, - "input_format": "json" + "version": "0.9" + } + ], + "version": "0.9" + }, + "shards": { + "_ingest-source": [ + { + "doc_mapping_uid": "00000000000000000000000000", + "follower_id": "follower-ingester", + "index_uid": "my-index:00000000000000000000000000", + "leader_id": "leader-ingester", + "publish_position_inclusive": "", + "shard_id": "00000000000000000001", + "shard_state": 1, + "source_id": "_ingest-source", + "update_timestamp": 1704067200 } ] }, "splits": [ { - "split_state": "Published", - "update_timestamp": 1789, - "publish_timestamp": 1789, - "version": "0.9", - "split_id": "split", - "index_uid": "my-index:00000000000000000000000000", - "partition_id": 7, - "source_id": "source", - "node_id": "node", - "num_docs": 12303, - "uncompressed_docs_size_in_bytes": 234234, - "time_range": { - "start": 121000, - "end": 130198 - }, "create_timestamp": 3, + "delete_opstamp": 10, + "doc_mapping_uid": "00000000000000000000000000", + "footer_offsets": { + "end": 2000, + "start": 1000 + }, + "index_uid": "my-index:00000000000000000000000000", "maturity": { - "type": "immature", - "maturation_period_millis": 4000 + "maturation_period_millis": 4000, + "type": "immature" }, + "node_id": "node", + "num_docs": 12303, + "num_merge_ops": 3, + "partition_id": 7, + "publish_timestamp": 1789, + "soft_deleted_doc_ids": [], + "source_id": "source", + "split_id": "split", + "split_state": "Published", "tags": [ "234", "aaa" ], - "footer_offsets": { - "start": 1000, - "end": 2000 + "time_range": { + "end": 130198, + "start": 121000 }, - "delete_opstamp": 10, - "num_merge_ops": 3, - "doc_mapping_uid": "00000000000000000000000000" + "uncompressed_docs_size_in_bytes": 234234, + "update_timestamp": 1789, + "version": "0.9" } ], - "shards": 
{ - "_ingest-source": [ - { - "index_uid": "my-index:00000000000000000000000000", - "source_id": "_ingest-source", - "shard_id": "00000000000000000001", - "leader_id": "leader-ingester", - "follower_id": "follower-ingester", - "shard_state": 1, - "publish_position_inclusive": "", - "doc_mapping_uid": "00000000000000000000000000", - "update_timestamp": 1704067200 - } - ] - }, - "delete_tasks": [ - { - "create_timestamp": 0, - "opstamp": 10, - "delete_query": { - "index_uid": "my-index:00000000000000000000000000", - "query_ast": "{\"type\":\"bool\",\"must\":[{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Harry\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}}},{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Potter\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}}}]}" - } - } - ] + "version": "0.9" } diff --git a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.8.expected.json b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.8.expected.json index 9f7f0e27f23..0f708ac16b3 100644 --- a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.8.expected.json +++ b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.8.expected.json @@ -1,200 +1,201 @@ { - "version": "0.9", + "delete_tasks": [ + { + "create_timestamp": 0, + "delete_query": { + "index_uid": "my-index:00000000000000000000000000", + "query_ast": "{\"type\":\"bool\",\"must\":[{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Harry\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}}},{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Potter\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}}}]}" + }, + "opstamp": 10 + } + ], "index": { - "version": "0.9", - "index_uid": "my-index:00000000000000000000000000", + "checkpoint": { + "kafka-source": { + "00000000000000000000": "00000000000000000042" + } + }, + "create_timestamp": 1789, "index_config": { - "version": "0.9", - "index_id": 
"my-index", - "index_uri": "s3://quickwit-indexes/my-index", "doc_mapping": { "doc_mapping_uid": "00000000000000000000000000", - "mode": "dynamic", "dynamic_mapping": { - "indexed": true, - "tokenizer": "raw", - "record": "basic", - "stored": true, "expand_dots": true, "fast": { "normalizer": "raw" - } + }, + "indexed": true, + "record": "basic", + "stored": true, + "tokenizer": "raw" }, "field_mappings": [ { + "coerce": true, + "fast": true, + "indexed": true, "name": "tenant_id", - "type": "u64", + "output_format": "number", "stored": true, - "indexed": true, - "fast": true, - "coerce": true, - "output_format": "number" + "type": "u64" }, { - "name": "timestamp", - "type": "datetime", + "fast": true, + "fast_precision": "seconds", + "indexed": true, "input_formats": [ "rfc3339", "unix_timestamp" ], + "name": "timestamp", "output_format": "rfc3339", - "fast_precision": "seconds", - "indexed": true, "stored": true, - "fast": true + "type": "datetime" }, { - "name": "log_level", - "type": "text", + "fast": false, + "fieldnorms": false, "indexed": true, - "tokenizer": "raw", + "name": "log_level", "record": "basic", - "fieldnorms": false, "stored": true, - "fast": false + "tokenizer": "raw", + "type": "text" }, { - "name": "message", - "type": "text", + "fast": false, + "fieldnorms": false, "indexed": true, - "tokenizer": "default", + "name": "message", "record": "position", - "fieldnorms": false, "stored": true, - "fast": false + "tokenizer": "default", + "type": "text" } ], - "timestamp_field": "timestamp", + "index_field_presence": true, + "max_num_partitions": 100, + "mode": "dynamic", + "partition_key": "tenant_id", + "store_document_size": false, + "store_source": true, "tag_fields": [ "log_level", "tenant_id" ], - "partition_key": "tenant_id", - "max_num_partitions": 100, - "index_field_presence": true, - "store_document_size": false, - "store_source": true, + "timestamp_field": "timestamp", "tokenizers": [ { + "filters": [], "name": "custom_tokenizer", - 
"type": "regex", "pattern": "[^\\p{L}\\p{N}]+", - "filters": [] + "type": "regex" } ] }, + "index_id": "my-index", + "index_uri": "s3://quickwit-indexes/my-index", "indexing_settings": { "commit_timeout_secs": 301, - "docstore_compression_level": 8, "docstore_blocksize": 1000000, - "split_num_docs_target": 10000001, + "docstore_compression_level": 8, "merge_policy": { - "type": "stable_log", - "min_level_num_docs": 100000, - "merge_factor": 9, + "maturation_period": "2days", "max_merge_factor": 11, - "maturation_period": "2days" + "merge_factor": 9, + "min_level_num_docs": 100000, + "type": "stable_log" }, "resources": { "heap_size": "50.0 MB" - } + }, + "split_num_docs_target": 10000001 }, "ingest_settings": { "min_shards": 1 }, + "retention": { + "period": "90 days", + "schedule": "daily" + }, "search_settings": { "default_search_fields": [ "message" ] }, - "retention": { - "period": "90 days", - "schedule": "daily" - } - }, - "checkpoint": { - "kafka-source": { - "00000000000000000000": "00000000000000000042" - } + "version": "0.9" }, - "create_timestamp": 1789, + "index_uid": "my-index:00000000000000000000000000", "sources": [ { - "version": "0.9", - "source_id": "kafka-source", - "num_pipelines": 2, "enabled": true, - "source_type": "kafka", + "input_format": "json", + "num_pipelines": 2, "params": { - "topic": "kafka-topic", - "client_params": {} + "client_params": {}, + "topic": "kafka-topic" }, + "source_id": "kafka-source", + "source_type": "kafka", "transform": { "script": ".message = downcase(string!(.message))", "timezone": "UTC" }, - "input_format": "json" + "version": "0.9" + } + ], + "version": "0.9" + }, + "shards": { + "_ingest-source": [ + { + "doc_mapping_uid": "00000000000000000000000000", + "follower_id": "follower-ingester", + "index_uid": "my-index:00000000000000000000000000", + "leader_id": "leader-ingester", + "publish_position_inclusive": "", + "shard_id": "00000000000000000001", + "shard_state": 1, + "source_id": "_ingest-source", + 
"update_timestamp": 1704067200 } ] }, "splits": [ { - "split_state": "Published", - "update_timestamp": 1789, - "publish_timestamp": 1789, - "version": "0.9", - "split_id": "split", - "index_uid": "my-index:00000000000000000000000000", - "partition_id": 7, - "source_id": "source", - "node_id": "node", - "num_docs": 12303, - "uncompressed_docs_size_in_bytes": 234234, - "time_range": { - "start": 121000, - "end": 130198 - }, "create_timestamp": 3, + "delete_opstamp": 10, + "doc_mapping_uid": "00000000000000000000000000", + "footer_offsets": { + "end": 2000, + "start": 1000 + }, + "index_uid": "my-index:00000000000000000000000000", "maturity": { - "type": "immature", - "maturation_period_millis": 4000 + "maturation_period_millis": 4000, + "type": "immature" }, + "node_id": "node", + "num_docs": 12303, + "num_merge_ops": 3, + "partition_id": 7, + "publish_timestamp": 1789, + "soft_deleted_doc_ids": [], + "source_id": "source", + "split_id": "split", + "split_state": "Published", "tags": [ "234", "aaa" ], - "footer_offsets": { - "start": 1000, - "end": 2000 + "time_range": { + "end": 130198, + "start": 121000 }, - "delete_opstamp": 10, - "num_merge_ops": 3, - "doc_mapping_uid": "00000000000000000000000000" + "uncompressed_docs_size_in_bytes": 234234, + "update_timestamp": 1789, + "version": "0.9" } ], - "shards": { - "_ingest-source": [ - { - "index_uid": "my-index:00000000000000000000000000", - "source_id": "_ingest-source", - "shard_id": "00000000000000000001", - "leader_id": "leader-ingester", - "follower_id": "follower-ingester", - "shard_state": 1, - "publish_position_inclusive": "", - "doc_mapping_uid": "00000000000000000000000000", - "update_timestamp": 1704067200 - } - ] - }, - "delete_tasks": [ - { - "create_timestamp": 0, - "opstamp": 10, - "delete_query": { - "index_uid": "my-index:00000000000000000000000000", - "query_ast": 
"{\"type\":\"bool\",\"must\":[{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Harry\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}}},{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Potter\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}}}]}" - } - } - ] + "version": "0.9" } diff --git a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.9.expected.json b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.9.expected.json index f9ecb6a7bcb..2d60feec007 100644 --- a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.9.expected.json +++ b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.9.expected.json @@ -1,200 +1,201 @@ { - "version": "0.9", + "delete_tasks": [ + { + "create_timestamp": 0, + "delete_query": { + "index_uid": "my-index:00000000000000000000000001", + "query_ast": "{\"type\":\"bool\",\"must\":[{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Harry\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}},\"lenient\":false},{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Potter\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}},\"lenient\":false}]}" + }, + "opstamp": 10 + } + ], "index": { - "version": "0.9", - "index_uid": "my-index:00000000000000000000000001", + "checkpoint": { + "kafka-source": { + "00000000000000000000": "00000000000000000042" + } + }, + "create_timestamp": 1789, "index_config": { - "version": "0.9", - "index_id": "my-index", - "index_uri": "s3://quickwit-indexes/my-index", "doc_mapping": { "doc_mapping_uid": "00000000000000000000000001", - "mode": "dynamic", "dynamic_mapping": { - "indexed": true, - "tokenizer": "raw", - "record": "basic", - "stored": true, "expand_dots": true, "fast": { "normalizer": "raw" - } + }, + "indexed": true, + "record": "basic", + "stored": true, + "tokenizer": "raw" }, "field_mappings": [ { + "coerce": true, + "fast": true, + "indexed": true, "name": "tenant_id", - "type": 
"u64", + "output_format": "number", "stored": true, - "indexed": true, - "fast": true, - "coerce": true, - "output_format": "number" + "type": "u64" }, { - "name": "timestamp", - "type": "datetime", + "fast": true, + "fast_precision": "seconds", + "indexed": true, "input_formats": [ "rfc3339", "unix_timestamp" ], + "name": "timestamp", "output_format": "rfc3339", - "fast_precision": "seconds", - "indexed": true, "stored": true, - "fast": true + "type": "datetime" }, { - "name": "log_level", - "type": "text", + "fast": false, + "fieldnorms": false, "indexed": true, - "tokenizer": "raw", + "name": "log_level", "record": "basic", - "fieldnorms": false, "stored": true, - "fast": false + "tokenizer": "raw", + "type": "text" }, { - "name": "message", - "type": "text", + "fast": false, + "fieldnorms": false, "indexed": true, - "tokenizer": "default", + "name": "message", "record": "position", - "fieldnorms": false, "stored": true, - "fast": false + "tokenizer": "default", + "type": "text" } ], - "timestamp_field": "timestamp", + "index_field_presence": true, + "max_num_partitions": 100, + "mode": "dynamic", + "partition_key": "tenant_id", + "store_document_size": false, + "store_source": true, "tag_fields": [ "log_level", "tenant_id" ], - "partition_key": "tenant_id", - "max_num_partitions": 100, - "index_field_presence": true, - "store_document_size": false, - "store_source": true, + "timestamp_field": "timestamp", "tokenizers": [ { + "filters": [], "name": "custom_tokenizer", - "type": "regex", "pattern": "[^\\p{L}\\p{N}]+", - "filters": [] + "type": "regex" } ] }, + "index_id": "my-index", + "index_uri": "s3://quickwit-indexes/my-index", "indexing_settings": { "commit_timeout_secs": 301, - "docstore_compression_level": 8, "docstore_blocksize": 1000000, - "split_num_docs_target": 10000001, + "docstore_compression_level": 8, "merge_policy": { - "type": "stable_log", - "min_level_num_docs": 100000, - "merge_factor": 9, + "maturation_period": "2days", "max_merge_factor": 
11, - "maturation_period": "2days" + "merge_factor": 9, + "min_level_num_docs": 100000, + "type": "stable_log" }, "resources": { "heap_size": "50.0 MB" - } + }, + "split_num_docs_target": 10000001 }, "ingest_settings": { "min_shards": 12 }, + "retention": { + "period": "90 days", + "schedule": "daily" + }, "search_settings": { "default_search_fields": [ "message" ] }, - "retention": { - "period": "90 days", - "schedule": "daily" - } - }, - "checkpoint": { - "kafka-source": { - "00000000000000000000": "00000000000000000042" - } + "version": "0.9" }, - "create_timestamp": 1789, + "index_uid": "my-index:00000000000000000000000001", "sources": [ { - "version": "0.9", - "source_id": "kafka-source", - "num_pipelines": 2, "enabled": true, - "source_type": "kafka", + "input_format": "json", + "num_pipelines": 2, "params": { - "topic": "kafka-topic", - "client_params": {} + "client_params": {}, + "topic": "kafka-topic" }, + "source_id": "kafka-source", + "source_type": "kafka", "transform": { "script": ".message = downcase(string!(.message))", "timezone": "UTC" }, - "input_format": "json" + "version": "0.9" + } + ], + "version": "0.9" + }, + "shards": { + "_ingest-source": [ + { + "doc_mapping_uid": "00000000000000000000000001", + "follower_id": "follower-ingester", + "index_uid": "my-index:00000000000000000000000001", + "leader_id": "leader-ingester", + "publish_position_inclusive": "", + "shard_id": "00000000000000000001", + "shard_state": 1, + "source_id": "_ingest-source", + "update_timestamp": 1724240908 } ] }, "splits": [ { - "split_state": "Published", - "update_timestamp": 1789, - "publish_timestamp": 1789, - "version": "0.9", - "split_id": "split", - "index_uid": "my-index:00000000000000000000000001", - "partition_id": 7, - "source_id": "source", - "node_id": "node", - "num_docs": 12303, - "uncompressed_docs_size_in_bytes": 234234, - "time_range": { - "start": 121000, - "end": 130198 - }, "create_timestamp": 3, + "delete_opstamp": 10, + "doc_mapping_uid": 
"00000000000000000000000000", + "footer_offsets": { + "end": 2000, + "start": 1000 + }, + "index_uid": "my-index:00000000000000000000000001", "maturity": { - "type": "immature", - "maturation_period_millis": 4000 + "maturation_period_millis": 4000, + "type": "immature" }, + "node_id": "node", + "num_docs": 12303, + "num_merge_ops": 3, + "partition_id": 7, + "publish_timestamp": 1789, + "soft_deleted_doc_ids": [], + "source_id": "source", + "split_id": "split", + "split_state": "Published", "tags": [ "234", "aaa" ], - "footer_offsets": { - "start": 1000, - "end": 2000 + "time_range": { + "end": 130198, + "start": 121000 }, - "delete_opstamp": 10, - "num_merge_ops": 3, - "doc_mapping_uid": "00000000000000000000000000" + "uncompressed_docs_size_in_bytes": 234234, + "update_timestamp": 1789, + "version": "0.9" } ], - "shards": { - "_ingest-source": [ - { - "index_uid": "my-index:00000000000000000000000001", - "source_id": "_ingest-source", - "shard_id": "00000000000000000001", - "leader_id": "leader-ingester", - "follower_id": "follower-ingester", - "shard_state": 1, - "publish_position_inclusive": "", - "doc_mapping_uid": "00000000000000000000000001", - "update_timestamp": 1724240908 - } - ] - }, - "delete_tasks": [ - { - "create_timestamp": 0, - "opstamp": 10, - "delete_query": { - "index_uid": "my-index:00000000000000000000000001", - "query_ast": "{\"type\":\"bool\",\"must\":[{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Harry\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}},\"lenient\":false},{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Potter\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}},\"lenient\":false}]}" - } - } - ] + "version": "0.9" } diff --git a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.9.json b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.9.json index f9ecb6a7bcb..2d60feec007 100644 --- a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.9.json +++ 
b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.9.json @@ -1,200 +1,201 @@ { - "version": "0.9", + "delete_tasks": [ + { + "create_timestamp": 0, + "delete_query": { + "index_uid": "my-index:00000000000000000000000001", + "query_ast": "{\"type\":\"bool\",\"must\":[{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Harry\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}},\"lenient\":false},{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Potter\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}},\"lenient\":false}]}" + }, + "opstamp": 10 + } + ], "index": { - "version": "0.9", - "index_uid": "my-index:00000000000000000000000001", + "checkpoint": { + "kafka-source": { + "00000000000000000000": "00000000000000000042" + } + }, + "create_timestamp": 1789, "index_config": { - "version": "0.9", - "index_id": "my-index", - "index_uri": "s3://quickwit-indexes/my-index", "doc_mapping": { "doc_mapping_uid": "00000000000000000000000001", - "mode": "dynamic", "dynamic_mapping": { - "indexed": true, - "tokenizer": "raw", - "record": "basic", - "stored": true, "expand_dots": true, "fast": { "normalizer": "raw" - } + }, + "indexed": true, + "record": "basic", + "stored": true, + "tokenizer": "raw" }, "field_mappings": [ { + "coerce": true, + "fast": true, + "indexed": true, "name": "tenant_id", - "type": "u64", + "output_format": "number", "stored": true, - "indexed": true, - "fast": true, - "coerce": true, - "output_format": "number" + "type": "u64" }, { - "name": "timestamp", - "type": "datetime", + "fast": true, + "fast_precision": "seconds", + "indexed": true, "input_formats": [ "rfc3339", "unix_timestamp" ], + "name": "timestamp", "output_format": "rfc3339", - "fast_precision": "seconds", - "indexed": true, "stored": true, - "fast": true + "type": "datetime" }, { - "name": "log_level", - "type": "text", + "fast": false, + "fieldnorms": false, "indexed": true, - "tokenizer": "raw", + "name": "log_level", "record": 
"basic", - "fieldnorms": false, "stored": true, - "fast": false + "tokenizer": "raw", + "type": "text" }, { - "name": "message", - "type": "text", + "fast": false, + "fieldnorms": false, "indexed": true, - "tokenizer": "default", + "name": "message", "record": "position", - "fieldnorms": false, "stored": true, - "fast": false + "tokenizer": "default", + "type": "text" } ], - "timestamp_field": "timestamp", + "index_field_presence": true, + "max_num_partitions": 100, + "mode": "dynamic", + "partition_key": "tenant_id", + "store_document_size": false, + "store_source": true, "tag_fields": [ "log_level", "tenant_id" ], - "partition_key": "tenant_id", - "max_num_partitions": 100, - "index_field_presence": true, - "store_document_size": false, - "store_source": true, + "timestamp_field": "timestamp", "tokenizers": [ { + "filters": [], "name": "custom_tokenizer", - "type": "regex", "pattern": "[^\\p{L}\\p{N}]+", - "filters": [] + "type": "regex" } ] }, + "index_id": "my-index", + "index_uri": "s3://quickwit-indexes/my-index", "indexing_settings": { "commit_timeout_secs": 301, - "docstore_compression_level": 8, "docstore_blocksize": 1000000, - "split_num_docs_target": 10000001, + "docstore_compression_level": 8, "merge_policy": { - "type": "stable_log", - "min_level_num_docs": 100000, - "merge_factor": 9, + "maturation_period": "2days", "max_merge_factor": 11, - "maturation_period": "2days" + "merge_factor": 9, + "min_level_num_docs": 100000, + "type": "stable_log" }, "resources": { "heap_size": "50.0 MB" - } + }, + "split_num_docs_target": 10000001 }, "ingest_settings": { "min_shards": 12 }, + "retention": { + "period": "90 days", + "schedule": "daily" + }, "search_settings": { "default_search_fields": [ "message" ] }, - "retention": { - "period": "90 days", - "schedule": "daily" - } - }, - "checkpoint": { - "kafka-source": { - "00000000000000000000": "00000000000000000042" - } + "version": "0.9" }, - "create_timestamp": 1789, + "index_uid": 
"my-index:00000000000000000000000001", "sources": [ { - "version": "0.9", - "source_id": "kafka-source", - "num_pipelines": 2, "enabled": true, - "source_type": "kafka", + "input_format": "json", + "num_pipelines": 2, "params": { - "topic": "kafka-topic", - "client_params": {} + "client_params": {}, + "topic": "kafka-topic" }, + "source_id": "kafka-source", + "source_type": "kafka", "transform": { "script": ".message = downcase(string!(.message))", "timezone": "UTC" }, - "input_format": "json" + "version": "0.9" + } + ], + "version": "0.9" + }, + "shards": { + "_ingest-source": [ + { + "doc_mapping_uid": "00000000000000000000000001", + "follower_id": "follower-ingester", + "index_uid": "my-index:00000000000000000000000001", + "leader_id": "leader-ingester", + "publish_position_inclusive": "", + "shard_id": "00000000000000000001", + "shard_state": 1, + "source_id": "_ingest-source", + "update_timestamp": 1724240908 } ] }, "splits": [ { - "split_state": "Published", - "update_timestamp": 1789, - "publish_timestamp": 1789, - "version": "0.9", - "split_id": "split", - "index_uid": "my-index:00000000000000000000000001", - "partition_id": 7, - "source_id": "source", - "node_id": "node", - "num_docs": 12303, - "uncompressed_docs_size_in_bytes": 234234, - "time_range": { - "start": 121000, - "end": 130198 - }, "create_timestamp": 3, + "delete_opstamp": 10, + "doc_mapping_uid": "00000000000000000000000000", + "footer_offsets": { + "end": 2000, + "start": 1000 + }, + "index_uid": "my-index:00000000000000000000000001", "maturity": { - "type": "immature", - "maturation_period_millis": 4000 + "maturation_period_millis": 4000, + "type": "immature" }, + "node_id": "node", + "num_docs": 12303, + "num_merge_ops": 3, + "partition_id": 7, + "publish_timestamp": 1789, + "soft_deleted_doc_ids": [], + "source_id": "source", + "split_id": "split", + "split_state": "Published", "tags": [ "234", "aaa" ], - "footer_offsets": { - "start": 1000, - "end": 2000 + "time_range": { + "end": 
130198, + "start": 121000 }, - "delete_opstamp": 10, - "num_merge_ops": 3, - "doc_mapping_uid": "00000000000000000000000000" + "uncompressed_docs_size_in_bytes": 234234, + "update_timestamp": 1789, + "version": "0.9" } ], - "shards": { - "_ingest-source": [ - { - "index_uid": "my-index:00000000000000000000000001", - "source_id": "_ingest-source", - "shard_id": "00000000000000000001", - "leader_id": "leader-ingester", - "follower_id": "follower-ingester", - "shard_state": 1, - "publish_position_inclusive": "", - "doc_mapping_uid": "00000000000000000000000001", - "update_timestamp": 1724240908 - } - ] - }, - "delete_tasks": [ - { - "create_timestamp": 0, - "opstamp": 10, - "delete_query": { - "index_uid": "my-index:00000000000000000000000001", - "query_ast": "{\"type\":\"bool\",\"must\":[{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Harry\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}},\"lenient\":false},{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Potter\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}},\"lenient\":false}]}" - } - } - ] + "version": "0.9" } diff --git a/quickwit/quickwit-metastore/test-data/split-metadata/v0.7.expected.json b/quickwit/quickwit-metastore/test-data/split-metadata/v0.7.expected.json index 248baebc68e..fc54c8b931c 100644 --- a/quickwit/quickwit-metastore/test-data/split-metadata/v0.7.expected.json +++ b/quickwit/quickwit-metastore/test-data/split-metadata/v0.7.expected.json @@ -1,30 +1,31 @@ { - "version": "0.9", - "split_id": "split", - "index_uid": "my-index:00000000000000000000000000", - "partition_id": 7, - "source_id": "source", - "node_id": "node", - "num_docs": 12303, - "uncompressed_docs_size_in_bytes": 234234, - "time_range": { - "start": 121000, - "end": 130198 - }, "create_timestamp": 3, + "delete_opstamp": 10, + "doc_mapping_uid": "00000000000000000000000000", + "footer_offsets": { + "end": 2000, + "start": 1000 + }, + "index_uid": 
"my-index:00000000000000000000000000", "maturity": { - "type": "immature", - "maturation_period_millis": 4000 + "maturation_period_millis": 4000, + "type": "immature" }, + "node_id": "node", + "num_docs": 12303, + "num_merge_ops": 3, + "partition_id": 7, + "soft_deleted_doc_ids": [], + "source_id": "source", + "split_id": "split", "tags": [ "234", "aaa" ], - "footer_offsets": { - "start": 1000, - "end": 2000 + "time_range": { + "end": 130198, + "start": 121000 }, - "delete_opstamp": 10, - "num_merge_ops": 3, - "doc_mapping_uid": "00000000000000000000000000" + "uncompressed_docs_size_in_bytes": 234234, + "version": "0.9" } diff --git a/quickwit/quickwit-metastore/test-data/split-metadata/v0.8.expected.json b/quickwit/quickwit-metastore/test-data/split-metadata/v0.8.expected.json index 248baebc68e..fc54c8b931c 100644 --- a/quickwit/quickwit-metastore/test-data/split-metadata/v0.8.expected.json +++ b/quickwit/quickwit-metastore/test-data/split-metadata/v0.8.expected.json @@ -1,30 +1,31 @@ { - "version": "0.9", - "split_id": "split", - "index_uid": "my-index:00000000000000000000000000", - "partition_id": 7, - "source_id": "source", - "node_id": "node", - "num_docs": 12303, - "uncompressed_docs_size_in_bytes": 234234, - "time_range": { - "start": 121000, - "end": 130198 - }, "create_timestamp": 3, + "delete_opstamp": 10, + "doc_mapping_uid": "00000000000000000000000000", + "footer_offsets": { + "end": 2000, + "start": 1000 + }, + "index_uid": "my-index:00000000000000000000000000", "maturity": { - "type": "immature", - "maturation_period_millis": 4000 + "maturation_period_millis": 4000, + "type": "immature" }, + "node_id": "node", + "num_docs": 12303, + "num_merge_ops": 3, + "partition_id": 7, + "soft_deleted_doc_ids": [], + "source_id": "source", + "split_id": "split", "tags": [ "234", "aaa" ], - "footer_offsets": { - "start": 1000, - "end": 2000 + "time_range": { + "end": 130198, + "start": 121000 }, - "delete_opstamp": 10, - "num_merge_ops": 3, - "doc_mapping_uid": 
"00000000000000000000000000" + "uncompressed_docs_size_in_bytes": 234234, + "version": "0.9" } diff --git a/quickwit/quickwit-metastore/test-data/split-metadata/v0.9.expected.json b/quickwit/quickwit-metastore/test-data/split-metadata/v0.9.expected.json index 85bdfca81e0..3e2d37292d0 100644 --- a/quickwit/quickwit-metastore/test-data/split-metadata/v0.9.expected.json +++ b/quickwit/quickwit-metastore/test-data/split-metadata/v0.9.expected.json @@ -1,30 +1,31 @@ { - "version": "0.9", - "split_id": "split", - "index_uid": "my-index:00000000000000000000000001", - "partition_id": 7, - "source_id": "source", - "node_id": "node", - "num_docs": 12303, - "uncompressed_docs_size_in_bytes": 234234, - "time_range": { - "start": 121000, - "end": 130198 - }, "create_timestamp": 3, + "delete_opstamp": 10, + "doc_mapping_uid": "00000000000000000000000000", + "footer_offsets": { + "end": 2000, + "start": 1000 + }, + "index_uid": "my-index:00000000000000000000000001", "maturity": { - "type": "immature", - "maturation_period_millis": 4000 + "maturation_period_millis": 4000, + "type": "immature" }, + "node_id": "node", + "num_docs": 12303, + "num_merge_ops": 3, + "partition_id": 7, + "soft_deleted_doc_ids": [], + "source_id": "source", + "split_id": "split", "tags": [ "234", "aaa" ], - "footer_offsets": { - "start": 1000, - "end": 2000 + "time_range": { + "end": 130198, + "start": 121000 }, - "delete_opstamp": 10, - "num_merge_ops": 3, - "doc_mapping_uid": "00000000000000000000000000" + "uncompressed_docs_size_in_bytes": 234234, + "version": "0.9" } diff --git a/quickwit/quickwit-metastore/test-data/split-metadata/v0.9.json b/quickwit/quickwit-metastore/test-data/split-metadata/v0.9.json index 85bdfca81e0..3e2d37292d0 100644 --- a/quickwit/quickwit-metastore/test-data/split-metadata/v0.9.json +++ b/quickwit/quickwit-metastore/test-data/split-metadata/v0.9.json @@ -1,30 +1,31 @@ { - "version": "0.9", - "split_id": "split", - "index_uid": "my-index:00000000000000000000000001", - 
"partition_id": 7, - "source_id": "source", - "node_id": "node", - "num_docs": 12303, - "uncompressed_docs_size_in_bytes": 234234, - "time_range": { - "start": 121000, - "end": 130198 - }, "create_timestamp": 3, + "delete_opstamp": 10, + "doc_mapping_uid": "00000000000000000000000000", + "footer_offsets": { + "end": 2000, + "start": 1000 + }, + "index_uid": "my-index:00000000000000000000000001", "maturity": { - "type": "immature", - "maturation_period_millis": 4000 + "maturation_period_millis": 4000, + "type": "immature" }, + "node_id": "node", + "num_docs": 12303, + "num_merge_ops": 3, + "partition_id": 7, + "soft_deleted_doc_ids": [], + "source_id": "source", + "split_id": "split", "tags": [ "234", "aaa" ], - "footer_offsets": { - "start": 1000, - "end": 2000 + "time_range": { + "end": 130198, + "start": 121000 }, - "delete_opstamp": 10, - "num_merge_ops": 3, - "doc_mapping_uid": "00000000000000000000000000" + "uncompressed_docs_size_in_bytes": 234234, + "version": "0.9" } diff --git a/quickwit/quickwit-proto/protos/quickwit/control_plane.proto b/quickwit/quickwit-proto/protos/quickwit/control_plane.proto index d0850091280..6aa7d28d7da 100644 --- a/quickwit/quickwit-proto/protos/quickwit/control_plane.proto +++ b/quickwit/quickwit-proto/protos/quickwit/control_plane.proto @@ -69,6 +69,21 @@ service ControlPlaneService { // Performs a debounced shard pruning request to the metastore. rpc PruneShards(quickwit.metastore.PruneShardsRequest) returns (quickwit.metastore.EmptyResponse); + + // Swaps indexing pipelines of different indexes between different indexers. + rpc SwapIndexingPipelines(SwapIndexingPipelinesRequest) returns (SwapIndexingPipelinesResponse); + + // Maintenance Mode API + + // Enables maintenance mode on the cluster. When active, the indexing plan is frozen, + // metadata mutations (index/source CRUD) are accepted but the plan is not rebuilt, and shard scaling/rebalancing is paused. 
+ rpc EnableMaintenanceMode(EnableMaintenanceModeRequest) returns (EnableMaintenanceModeResponse); + + // Disables maintenance mode. Triggers a full indexing plan rebuild to reconcile the cluster. + rpc DisableMaintenanceMode(DisableMaintenanceModeRequest) returns (DisableMaintenanceModeResponse); + + // Returns the current maintenance mode status. + rpc GetMaintenanceMode(GetMaintenanceModeRequest) returns (GetMaintenanceModeResponse); } // Shard API @@ -125,3 +140,59 @@ message AdviseResetShardsResponse { repeated quickwit.ingest.ShardIds shards_to_delete = 1; repeated quickwit.ingest.ShardIdPositions shards_to_truncate = 2; } + +message SwapIndexingPipelinesRequest { + repeated SwapIndexingPipelinesEntry swaps = 1; +} + +message SwapIndexingPipelinesEntry { + string left_node_id = 1; + string left_index_id = 2; + string right_node_id = 3; + optional string right_index_id = 4; +} + +message SwapIndexingPipelinesResponse { + repeated SwapIndexingPipelinesResult results = 1; +} + +message SwapIndexingPipelinesResult { + SwapIndexingPipelinesEntry swap = 1; + bool success = 2; + // Human-readable reason when success is false. + string reason = 3; +} + +// Maintenance Mode API + +message EnableMaintenanceModeRequest { +} + +message EnableMaintenanceModeResponse { + // The frozen physical indexing plan serialized as JSON. + string frozen_plan_json = 1; +} + +message DisableMaintenanceModeRequest { +} + +message DisableMaintenanceModeResponse { +} + +message GetMaintenanceModeRequest { +} + +message GetMaintenanceModeResponse { + bool is_maintenance_mode = 1; + // If maintenance mode is active, the RFC 3339 datetime string when it was enabled. 
+ optional string enabled_at = 2; +} + +message MaintenanceFrozenPlanForNode { + string index_id = 1; + repeated quickwit.indexing.IndexingTask indexing_tasks = 2; +} + +message MaintenanceFrozenPlan { + repeated MaintenanceFrozenPlanForNode state_per_node = 2; +} \ No newline at end of file diff --git a/quickwit/quickwit-proto/protos/quickwit/metastore.proto b/quickwit/quickwit-proto/protos/quickwit/metastore.proto index 00680da02d0..97a1287068c 100644 --- a/quickwit/quickwit-proto/protos/quickwit/metastore.proto +++ b/quickwit/quickwit-proto/protos/quickwit/metastore.proto @@ -125,6 +125,9 @@ service MetastoreService { // Deletes splits. rpc DeleteSplits(DeleteSplitsRequest) returns (EmptyResponse); + // Soft-deletes individual documents within published splits. + rpc SoftDeleteDocuments(SoftDeleteDocumentsRequest) returns (SoftDeleteDocumentsResponse); + // Adds a source. rpc AddSource(AddSourceRequest) returns (EmptyResponse); @@ -202,6 +205,17 @@ service MetastoreService { // Deletes index templates. rpc DeleteIndexTemplates(DeleteIndexTemplatesRequest) returns (EmptyResponse); + // Key-Value API (for cluster-wide settings like maintenance mode) + + // Gets a value by key from the cluster-wide key-value store. + rpc GetKv(GetKvRequest) returns (GetKvResponse); + + // Sets a key-value pair in the cluster-wide key-value store. + rpc SetKv(SetKvRequest) returns (EmptyResponse); + + // Deletes a key from the cluster-wide key-value store. 
+ rpc DeleteKv(DeleteKvRequest) returns (EmptyResponse); + // Get cluster identity rpc GetClusterIdentity(GetClusterIdentityRequest) returns (GetClusterIdentityResponse); } @@ -348,6 +362,20 @@ message DeleteSplitsRequest { repeated string split_ids = 3; } +message SplitDocIds { + string split_id = 1; + repeated uint32 doc_ids = 2; +} + +message SoftDeleteDocumentsRequest { + quickwit.common.IndexUid index_uid = 1; + repeated SplitDocIds split_doc_ids = 2; +} + +message SoftDeleteDocumentsResponse { + uint64 num_soft_deleted_doc_ids = 1; +} + message AddSourceRequest { quickwit.common.IndexUid index_uid = 1; string source_config_json = 2; @@ -561,6 +589,26 @@ message DeleteIndexTemplatesRequest { repeated string template_ids = 1; } +// Key-Value API + +message GetKvRequest { + string key = 1; +} + +message GetKvResponse { + // Empty if the key does not exist. + optional string value = 1; +} + +message SetKvRequest { + string key = 1; + string value = 2; +} + +message DeleteKvRequest { + string key = 1; +} + message GetClusterIdentityRequest { } diff --git a/quickwit/quickwit-proto/protos/quickwit/search.proto b/quickwit/quickwit-proto/protos/quickwit/search.proto index 7b543e9ed25..f6acff2877b 100644 --- a/quickwit/quickwit-proto/protos/quickwit/search.proto +++ b/quickwit/quickwit-proto/protos/quickwit/search.proto @@ -37,6 +37,11 @@ service SearchService { /// This methods takes `PartialHit`s and returns `Hit`s. rpc FetchDocs(FetchDocsRequest) returns (FetchDocsResponse); + // Streams document contents from the document store. + // This method takes `PartialHit`s and streams back `LeafHit`s in batches + // to avoid hitting gRPC message size limits. + rpc StreamFetchDocs(FetchDocsRequest) returns (stream FetchDocsResponse); + // Root list terms API. // This RPC identifies the set of splits on which the query should run on, // and dispatches the several calls to `LeafListTerms`. 
@@ -249,6 +254,9 @@ message SearchRequest { bool ignore_missing_indexes = 18; optional string split_id = 19; + + // The user agent of the client that initiated the search request. + optional string user_agent = 20; } enum CountHits { @@ -264,7 +272,7 @@ message SortField { SortOrder sort_order = 2; // Optional sort value format for datetime field only. // If none, the default output format for datetime field is - // unix_timestamp_nanos. + // unix_timestamp_millis. optional SortDatetimeFormat sort_datetime_format = 3; } @@ -292,8 +300,8 @@ message SearchResponse { // server-side and expressed in microseconds. uint64 elapsed_time_micros = 3; - // The searcherrors that occurred formatted as string. - repeated string errors = 4; + // deprecated `errors` field + reserved 4; // used to be json-encoded aggregation reserved 5; @@ -386,6 +394,8 @@ message SplitIdAndFooterOffsets { optional int64 timestamp_end = 5; // The number of docs in the split uint64 num_docs = 6; + // Tantivy doc IDs that have been soft-deleted from this split + repeated uint32 soft_deleted_doc_ids = 7; } // Hits returned by a FetchDocRequest. @@ -461,9 +471,11 @@ message SortByValue { int64 i64 = 2; double f64 = 3; bool boolean = 4; + string str = 5; + int64 datetime = 6; } // Room for eventual future sorted key types. - reserved 5 to 20; + reserved 7 to 20; } message LeafSearchResponse { diff --git a/quickwit/quickwit-proto/src/codegen/jaeger/opentelemetry.proto.trace.v1.rs b/quickwit/quickwit-proto/src/codegen/jaeger/opentelemetry.proto.trace.v1.rs index 6736d97c7e2..afa08ca3c9d 100644 --- a/quickwit/quickwit-proto/src/codegen/jaeger/opentelemetry.proto.trace.v1.rs +++ b/quickwit/quickwit-proto/src/codegen/jaeger/opentelemetry.proto.trace.v1.rs @@ -120,10 +120,12 @@ pub struct Span { /// attributes is a collection of key/value pairs. Note, global attributes /// like server name can be set using the resource API. 
Examples of attributes: /// - /// "/http/user_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36" - /// "/http/server_latency": 300 - /// "abc.com/myattribute": true - /// "abc.com/score": 10.239 + /// ```text + /// "/http/user_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36" + /// "/http/server_latency": 300 + /// "abc.com/myattribute": true + /// "abc.com/score": 10.239 + /// ``` /// /// The OpenTelemetry API specification further restricts the allowed value types: /// @@ -276,7 +278,7 @@ pub mod span { } /// The Status type defines a logical error model that is suitable for different /// programming environments, including REST APIs and RPC APIs. -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct Status { /// A developer-facing human readable error message. #[prost(string, tag = "2")] diff --git a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.control_plane.rs b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.control_plane.rs index 09cfbdebf58..6c1a3e98600 100644 --- a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.control_plane.rs +++ b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.control_plane.rs @@ -73,6 +73,83 @@ pub struct AdviseResetShardsResponse { pub shards_to_truncate: ::prost::alloc::vec::Vec, } #[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct SwapIndexingPipelinesRequest { + #[prost(message, repeated, tag = "1")] + pub swaps: ::prost::alloc::vec::Vec, +} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] +pub struct SwapIndexingPipelinesEntry { + #[prost(string, tag = "1")] + pub left_node_id: ::prost::alloc::string::String, + #[prost(string, tag = "2")] 
+ pub left_index_id: ::prost::alloc::string::String, + #[prost(string, tag = "3")] + pub right_node_id: ::prost::alloc::string::String, + #[prost(string, optional, tag = "4")] + pub right_index_id: ::core::option::Option<::prost::alloc::string::String>, +} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct SwapIndexingPipelinesResponse { + #[prost(message, repeated, tag = "1")] + pub results: ::prost::alloc::vec::Vec, +} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] +pub struct SwapIndexingPipelinesResult { + #[prost(message, optional, tag = "1")] + pub swap: ::core::option::Option, + #[prost(bool, tag = "2")] + pub success: bool, + /// Human-readable reason when success is false. + #[prost(string, tag = "3")] + pub reason: ::prost::alloc::string::String, +} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] +pub struct EnableMaintenanceModeRequest {} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] +pub struct EnableMaintenanceModeResponse { + /// The frozen physical indexing plan serialized as JSON. 
+ #[prost(string, tag = "1")] + pub frozen_plan_json: ::prost::alloc::string::String, +} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] +pub struct DisableMaintenanceModeRequest {} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] +pub struct DisableMaintenanceModeResponse {} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] +pub struct GetMaintenanceModeRequest {} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] +pub struct GetMaintenanceModeResponse { + #[prost(bool, tag = "1")] + pub is_maintenance_mode: bool, + /// If maintenance mode is active, the RFC 3339 datetime string when it was enabled. + #[prost(string, optional, tag = "2")] + pub enabled_at: ::core::option::Option<::prost::alloc::string::String>, +} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct MaintenanceFrozenPlanForNode { + #[prost(string, tag = "1")] + pub index_id: ::prost::alloc::string::String, + #[prost(message, repeated, tag = "2")] + pub indexing_tasks: ::prost::alloc::vec::Vec, +} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct MaintenanceFrozenPlan { + #[prost(message, repeated, tag = "2")] + pub state_per_node: ::prost::alloc::vec::Vec, +} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] #[serde(rename_all = "snake_case")] #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] @@ -180,6 +257,27 @@ pub trait ControlPlaneService: std::fmt::Debug + Send + Sync + 'static { &self, request: super::metastore::PruneShardsRequest, ) -> 
crate::control_plane::ControlPlaneResult; + ///Swaps indexing pipelines of different indexes between different indexers. + async fn swap_indexing_pipelines( + &self, + request: SwapIndexingPipelinesRequest, + ) -> crate::control_plane::ControlPlaneResult; + ///Enables maintenance mode on the cluster. When active, the indexing plan is frozen, + ///metadata mutations (index/source CRUD) are accepted but the plan is not rebuilt, and shard scaling/rebalancing is paused. + async fn enable_maintenance_mode( + &self, + request: EnableMaintenanceModeRequest, + ) -> crate::control_plane::ControlPlaneResult; + ///Disables maintenance mode. Triggers a full indexing plan rebuild to reconcile the cluster. + async fn disable_maintenance_mode( + &self, + request: DisableMaintenanceModeRequest, + ) -> crate::control_plane::ControlPlaneResult; + ///Returns the current maintenance mode status. + async fn get_maintenance_mode( + &self, + request: GetMaintenanceModeRequest, + ) -> crate::control_plane::ControlPlaneResult; } #[derive(Debug, Clone)] pub struct ControlPlaneServiceClient { @@ -352,6 +450,30 @@ impl ControlPlaneService for ControlPlaneServiceClient { ) -> crate::control_plane::ControlPlaneResult { self.inner.0.prune_shards(request).await } + async fn swap_indexing_pipelines( + &self, + request: SwapIndexingPipelinesRequest, + ) -> crate::control_plane::ControlPlaneResult { + self.inner.0.swap_indexing_pipelines(request).await + } + async fn enable_maintenance_mode( + &self, + request: EnableMaintenanceModeRequest, + ) -> crate::control_plane::ControlPlaneResult { + self.inner.0.enable_maintenance_mode(request).await + } + async fn disable_maintenance_mode( + &self, + request: DisableMaintenanceModeRequest, + ) -> crate::control_plane::ControlPlaneResult { + self.inner.0.disable_maintenance_mode(request).await + } + async fn get_maintenance_mode( + &self, + request: GetMaintenanceModeRequest, + ) -> crate::control_plane::ControlPlaneResult { + 
self.inner.0.get_maintenance_mode(request).await + } } #[cfg(any(test, feature = "testsuite"))] pub mod mock_control_plane_service { @@ -440,6 +562,38 @@ pub mod mock_control_plane_service { > { self.inner.lock().await.prune_shards(request).await } + async fn swap_indexing_pipelines( + &self, + request: super::SwapIndexingPipelinesRequest, + ) -> crate::control_plane::ControlPlaneResult< + super::SwapIndexingPipelinesResponse, + > { + self.inner.lock().await.swap_indexing_pipelines(request).await + } + async fn enable_maintenance_mode( + &self, + request: super::EnableMaintenanceModeRequest, + ) -> crate::control_plane::ControlPlaneResult< + super::EnableMaintenanceModeResponse, + > { + self.inner.lock().await.enable_maintenance_mode(request).await + } + async fn disable_maintenance_mode( + &self, + request: super::DisableMaintenanceModeRequest, + ) -> crate::control_plane::ControlPlaneResult< + super::DisableMaintenanceModeResponse, + > { + self.inner.lock().await.disable_maintenance_mode(request).await + } + async fn get_maintenance_mode( + &self, + request: super::GetMaintenanceModeRequest, + ) -> crate::control_plane::ControlPlaneResult< + super::GetMaintenanceModeResponse, + > { + self.inner.lock().await.get_maintenance_mode(request).await + } } } pub type BoxFuture = std::pin::Pin< @@ -613,6 +767,70 @@ for InnerControlPlaneServiceClient { Box::pin(fut) } } +impl tower::Service for InnerControlPlaneServiceClient { + type Response = SwapIndexingPipelinesResponse; + type Error = crate::control_plane::ControlPlaneError; + type Future = BoxFuture; + fn poll_ready( + &mut self, + _cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + std::task::Poll::Ready(Ok(())) + } + fn call(&mut self, request: SwapIndexingPipelinesRequest) -> Self::Future { + let svc = self.clone(); + let fut = async move { svc.0.swap_indexing_pipelines(request).await }; + Box::pin(fut) + } +} +impl tower::Service for InnerControlPlaneServiceClient { + type Response = 
EnableMaintenanceModeResponse; + type Error = crate::control_plane::ControlPlaneError; + type Future = BoxFuture; + fn poll_ready( + &mut self, + _cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + std::task::Poll::Ready(Ok(())) + } + fn call(&mut self, request: EnableMaintenanceModeRequest) -> Self::Future { + let svc = self.clone(); + let fut = async move { svc.0.enable_maintenance_mode(request).await }; + Box::pin(fut) + } +} +impl tower::Service for InnerControlPlaneServiceClient { + type Response = DisableMaintenanceModeResponse; + type Error = crate::control_plane::ControlPlaneError; + type Future = BoxFuture; + fn poll_ready( + &mut self, + _cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + std::task::Poll::Ready(Ok(())) + } + fn call(&mut self, request: DisableMaintenanceModeRequest) -> Self::Future { + let svc = self.clone(); + let fut = async move { svc.0.disable_maintenance_mode(request).await }; + Box::pin(fut) + } +} +impl tower::Service for InnerControlPlaneServiceClient { + type Response = GetMaintenanceModeResponse; + type Error = crate::control_plane::ControlPlaneError; + type Future = BoxFuture; + fn poll_ready( + &mut self, + _cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + std::task::Poll::Ready(Ok(())) + } + fn call(&mut self, request: GetMaintenanceModeRequest) -> Self::Future { + let svc = self.clone(); + let fut = async move { svc.0.get_maintenance_mode(request).await }; + Box::pin(fut) + } +} /// A tower service stack is a set of tower services. 
#[derive(Debug)] struct ControlPlaneServiceTowerServiceStack { @@ -668,6 +886,26 @@ struct ControlPlaneServiceTowerServiceStack { super::metastore::EmptyResponse, crate::control_plane::ControlPlaneError, >, + swap_indexing_pipelines_svc: quickwit_common::tower::BoxService< + SwapIndexingPipelinesRequest, + SwapIndexingPipelinesResponse, + crate::control_plane::ControlPlaneError, + >, + enable_maintenance_mode_svc: quickwit_common::tower::BoxService< + EnableMaintenanceModeRequest, + EnableMaintenanceModeResponse, + crate::control_plane::ControlPlaneError, + >, + disable_maintenance_mode_svc: quickwit_common::tower::BoxService< + DisableMaintenanceModeRequest, + DisableMaintenanceModeResponse, + crate::control_plane::ControlPlaneError, + >, + get_maintenance_mode_svc: quickwit_common::tower::BoxService< + GetMaintenanceModeRequest, + GetMaintenanceModeResponse, + crate::control_plane::ControlPlaneError, + >, } #[async_trait::async_trait] impl ControlPlaneService for ControlPlaneServiceTowerServiceStack { @@ -735,6 +973,30 @@ impl ControlPlaneService for ControlPlaneServiceTowerServiceStack { ) -> crate::control_plane::ControlPlaneResult { self.prune_shards_svc.clone().ready().await?.call(request).await } + async fn swap_indexing_pipelines( + &self, + request: SwapIndexingPipelinesRequest, + ) -> crate::control_plane::ControlPlaneResult { + self.swap_indexing_pipelines_svc.clone().ready().await?.call(request).await + } + async fn enable_maintenance_mode( + &self, + request: EnableMaintenanceModeRequest, + ) -> crate::control_plane::ControlPlaneResult { + self.enable_maintenance_mode_svc.clone().ready().await?.call(request).await + } + async fn disable_maintenance_mode( + &self, + request: DisableMaintenanceModeRequest, + ) -> crate::control_plane::ControlPlaneResult { + self.disable_maintenance_mode_svc.clone().ready().await?.call(request).await + } + async fn get_maintenance_mode( + &self, + request: GetMaintenanceModeRequest, + ) -> 
crate::control_plane::ControlPlaneResult { + self.get_maintenance_mode_svc.clone().ready().await?.call(request).await + } } type CreateIndexLayer = quickwit_common::tower::BoxLayer< quickwit_common::tower::BoxService< @@ -836,6 +1098,46 @@ type PruneShardsLayer = quickwit_common::tower::BoxLayer< super::metastore::EmptyResponse, crate::control_plane::ControlPlaneError, >; +type SwapIndexingPipelinesLayer = quickwit_common::tower::BoxLayer< + quickwit_common::tower::BoxService< + SwapIndexingPipelinesRequest, + SwapIndexingPipelinesResponse, + crate::control_plane::ControlPlaneError, + >, + SwapIndexingPipelinesRequest, + SwapIndexingPipelinesResponse, + crate::control_plane::ControlPlaneError, +>; +type EnableMaintenanceModeLayer = quickwit_common::tower::BoxLayer< + quickwit_common::tower::BoxService< + EnableMaintenanceModeRequest, + EnableMaintenanceModeResponse, + crate::control_plane::ControlPlaneError, + >, + EnableMaintenanceModeRequest, + EnableMaintenanceModeResponse, + crate::control_plane::ControlPlaneError, +>; +type DisableMaintenanceModeLayer = quickwit_common::tower::BoxLayer< + quickwit_common::tower::BoxService< + DisableMaintenanceModeRequest, + DisableMaintenanceModeResponse, + crate::control_plane::ControlPlaneError, + >, + DisableMaintenanceModeRequest, + DisableMaintenanceModeResponse, + crate::control_plane::ControlPlaneError, +>; +type GetMaintenanceModeLayer = quickwit_common::tower::BoxLayer< + quickwit_common::tower::BoxService< + GetMaintenanceModeRequest, + GetMaintenanceModeResponse, + crate::control_plane::ControlPlaneError, + >, + GetMaintenanceModeRequest, + GetMaintenanceModeResponse, + crate::control_plane::ControlPlaneError, +>; #[derive(Debug, Default)] pub struct ControlPlaneServiceTowerLayerStack { create_index_layers: Vec, @@ -848,6 +1150,10 @@ pub struct ControlPlaneServiceTowerLayerStack { get_or_create_open_shards_layers: Vec, advise_reset_shards_layers: Vec, prune_shards_layers: Vec, + swap_indexing_pipelines_layers: Vec, 
+ enable_maintenance_mode_layers: Vec, + disable_maintenance_mode_layers: Vec, + get_maintenance_mode_layers: Vec, } impl ControlPlaneServiceTowerLayerStack { pub fn stack_layer(mut self, layer: L) -> Self @@ -1120,6 +1426,114 @@ impl ControlPlaneServiceTowerLayerStack { >>::Service as tower::Service< super::metastore::PruneShardsRequest, >>::Future: Send + 'static, + L: tower::Layer< + quickwit_common::tower::BoxService< + SwapIndexingPipelinesRequest, + SwapIndexingPipelinesResponse, + crate::control_plane::ControlPlaneError, + >, + > + Clone + Send + Sync + 'static, + , + >>::Service: tower::Service< + SwapIndexingPipelinesRequest, + Response = SwapIndexingPipelinesResponse, + Error = crate::control_plane::ControlPlaneError, + > + Clone + Send + Sync + 'static, + <, + >>::Service as tower::Service< + SwapIndexingPipelinesRequest, + >>::Future: Send + 'static, + L: tower::Layer< + quickwit_common::tower::BoxService< + EnableMaintenanceModeRequest, + EnableMaintenanceModeResponse, + crate::control_plane::ControlPlaneError, + >, + > + Clone + Send + Sync + 'static, + , + >>::Service: tower::Service< + EnableMaintenanceModeRequest, + Response = EnableMaintenanceModeResponse, + Error = crate::control_plane::ControlPlaneError, + > + Clone + Send + Sync + 'static, + <, + >>::Service as tower::Service< + EnableMaintenanceModeRequest, + >>::Future: Send + 'static, + L: tower::Layer< + quickwit_common::tower::BoxService< + DisableMaintenanceModeRequest, + DisableMaintenanceModeResponse, + crate::control_plane::ControlPlaneError, + >, + > + Clone + Send + Sync + 'static, + , + >>::Service: tower::Service< + DisableMaintenanceModeRequest, + Response = DisableMaintenanceModeResponse, + Error = crate::control_plane::ControlPlaneError, + > + Clone + Send + Sync + 'static, + <, + >>::Service as tower::Service< + DisableMaintenanceModeRequest, + >>::Future: Send + 'static, + L: tower::Layer< + quickwit_common::tower::BoxService< + GetMaintenanceModeRequest, + 
GetMaintenanceModeResponse, + crate::control_plane::ControlPlaneError, + >, + > + Clone + Send + Sync + 'static, + , + >>::Service: tower::Service< + GetMaintenanceModeRequest, + Response = GetMaintenanceModeResponse, + Error = crate::control_plane::ControlPlaneError, + > + Clone + Send + Sync + 'static, + <, + >>::Service as tower::Service< + GetMaintenanceModeRequest, + >>::Future: Send + 'static, { self.create_index_layers .push(quickwit_common::tower::BoxLayer::new(layer.clone())); @@ -1141,6 +1555,14 @@ impl ControlPlaneServiceTowerLayerStack { .push(quickwit_common::tower::BoxLayer::new(layer.clone())); self.prune_shards_layers .push(quickwit_common::tower::BoxLayer::new(layer.clone())); + self.swap_indexing_pipelines_layers + .push(quickwit_common::tower::BoxLayer::new(layer.clone())); + self.enable_maintenance_mode_layers + .push(quickwit_common::tower::BoxLayer::new(layer.clone())); + self.disable_maintenance_mode_layers + .push(quickwit_common::tower::BoxLayer::new(layer.clone())); + self.get_maintenance_mode_layers + .push(quickwit_common::tower::BoxLayer::new(layer.clone())); self } pub fn stack_create_index_layer(mut self, layer: L) -> Self @@ -1353,28 +1775,116 @@ impl ControlPlaneServiceTowerLayerStack { self.prune_shards_layers.push(quickwit_common::tower::BoxLayer::new(layer)); self } - pub fn build(self, instance: T) -> ControlPlaneServiceClient + pub fn stack_swap_indexing_pipelines_layer(mut self, layer: L) -> Self where - T: ControlPlaneService, + L: tower::Layer< + quickwit_common::tower::BoxService< + SwapIndexingPipelinesRequest, + SwapIndexingPipelinesResponse, + crate::control_plane::ControlPlaneError, + >, + > + Send + Sync + 'static, + L::Service: tower::Service< + SwapIndexingPipelinesRequest, + Response = SwapIndexingPipelinesResponse, + Error = crate::control_plane::ControlPlaneError, + > + Clone + Send + Sync + 'static, + >::Future: Send + 'static, { - let inner_client = InnerControlPlaneServiceClient(std::sync::Arc::new(instance)); 
- self.build_from_inner_client(inner_client) - } - pub fn build_from_channel( - self, - addr: std::net::SocketAddr, - channel: tonic::transport::Channel, - max_message_size: bytesize::ByteSize, - compression_encoding_opt: Option, - ) -> ControlPlaneServiceClient { - let client = ControlPlaneServiceClient::from_channel( - addr, - channel, - max_message_size, - compression_encoding_opt, - ); - let inner_client = client.inner; - self.build_from_inner_client(inner_client) + self.swap_indexing_pipelines_layers + .push(quickwit_common::tower::BoxLayer::new(layer)); + self + } + pub fn stack_enable_maintenance_mode_layer(mut self, layer: L) -> Self + where + L: tower::Layer< + quickwit_common::tower::BoxService< + EnableMaintenanceModeRequest, + EnableMaintenanceModeResponse, + crate::control_plane::ControlPlaneError, + >, + > + Send + Sync + 'static, + L::Service: tower::Service< + EnableMaintenanceModeRequest, + Response = EnableMaintenanceModeResponse, + Error = crate::control_plane::ControlPlaneError, + > + Clone + Send + Sync + 'static, + >::Future: Send + 'static, + { + self.enable_maintenance_mode_layers + .push(quickwit_common::tower::BoxLayer::new(layer)); + self + } + pub fn stack_disable_maintenance_mode_layer(mut self, layer: L) -> Self + where + L: tower::Layer< + quickwit_common::tower::BoxService< + DisableMaintenanceModeRequest, + DisableMaintenanceModeResponse, + crate::control_plane::ControlPlaneError, + >, + > + Send + Sync + 'static, + L::Service: tower::Service< + DisableMaintenanceModeRequest, + Response = DisableMaintenanceModeResponse, + Error = crate::control_plane::ControlPlaneError, + > + Clone + Send + Sync + 'static, + >::Future: Send + 'static, + { + self.disable_maintenance_mode_layers + .push(quickwit_common::tower::BoxLayer::new(layer)); + self + } + pub fn stack_get_maintenance_mode_layer(mut self, layer: L) -> Self + where + L: tower::Layer< + quickwit_common::tower::BoxService< + GetMaintenanceModeRequest, + GetMaintenanceModeResponse, 
+ crate::control_plane::ControlPlaneError, + >, + > + Send + Sync + 'static, + L::Service: tower::Service< + GetMaintenanceModeRequest, + Response = GetMaintenanceModeResponse, + Error = crate::control_plane::ControlPlaneError, + > + Clone + Send + Sync + 'static, + >::Future: Send + 'static, + { + self.get_maintenance_mode_layers + .push(quickwit_common::tower::BoxLayer::new(layer)); + self + } + pub fn build(self, instance: T) -> ControlPlaneServiceClient + where + T: ControlPlaneService, + { + let inner_client = InnerControlPlaneServiceClient(std::sync::Arc::new(instance)); + self.build_from_inner_client(inner_client) + } + pub fn build_from_channel( + self, + addr: std::net::SocketAddr, + channel: tonic::transport::Channel, + max_message_size: bytesize::ByteSize, + compression_encoding_opt: Option, + ) -> ControlPlaneServiceClient { + let client = ControlPlaneServiceClient::from_channel( + addr, + channel, + max_message_size, + compression_encoding_opt, + ); + let inner_client = client.inner; + self.build_from_inner_client(inner_client) } pub fn build_from_balance_channel( self, @@ -1496,6 +2006,38 @@ impl ControlPlaneServiceTowerLayerStack { quickwit_common::tower::BoxService::new(inner_client.clone()), |svc, layer| layer.layer(svc), ); + let swap_indexing_pipelines_svc = self + .swap_indexing_pipelines_layers + .into_iter() + .rev() + .fold( + quickwit_common::tower::BoxService::new(inner_client.clone()), + |svc, layer| layer.layer(svc), + ); + let enable_maintenance_mode_svc = self + .enable_maintenance_mode_layers + .into_iter() + .rev() + .fold( + quickwit_common::tower::BoxService::new(inner_client.clone()), + |svc, layer| layer.layer(svc), + ); + let disable_maintenance_mode_svc = self + .disable_maintenance_mode_layers + .into_iter() + .rev() + .fold( + quickwit_common::tower::BoxService::new(inner_client.clone()), + |svc, layer| layer.layer(svc), + ); + let get_maintenance_mode_svc = self + .get_maintenance_mode_layers + .into_iter() + .rev() + .fold( 
+ quickwit_common::tower::BoxService::new(inner_client.clone()), + |svc, layer| layer.layer(svc), + ); let tower_svc_stack = ControlPlaneServiceTowerServiceStack { inner: inner_client, create_index_svc, @@ -1508,6 +2050,10 @@ impl ControlPlaneServiceTowerLayerStack { get_or_create_open_shards_svc, advise_reset_shards_svc, prune_shards_svc, + swap_indexing_pipelines_svc, + enable_maintenance_mode_svc, + disable_maintenance_mode_svc, + get_maintenance_mode_svc, }; ControlPlaneServiceClient::new(tower_svc_stack) } @@ -1673,6 +2219,42 @@ where super::metastore::EmptyResponse, crate::control_plane::ControlPlaneError, >, + > + + tower::Service< + SwapIndexingPipelinesRequest, + Response = SwapIndexingPipelinesResponse, + Error = crate::control_plane::ControlPlaneError, + Future = BoxFuture< + SwapIndexingPipelinesResponse, + crate::control_plane::ControlPlaneError, + >, + > + + tower::Service< + EnableMaintenanceModeRequest, + Response = EnableMaintenanceModeResponse, + Error = crate::control_plane::ControlPlaneError, + Future = BoxFuture< + EnableMaintenanceModeResponse, + crate::control_plane::ControlPlaneError, + >, + > + + tower::Service< + DisableMaintenanceModeRequest, + Response = DisableMaintenanceModeResponse, + Error = crate::control_plane::ControlPlaneError, + Future = BoxFuture< + DisableMaintenanceModeResponse, + crate::control_plane::ControlPlaneError, + >, + > + + tower::Service< + GetMaintenanceModeRequest, + Response = GetMaintenanceModeResponse, + Error = crate::control_plane::ControlPlaneError, + Future = BoxFuture< + GetMaintenanceModeResponse, + crate::control_plane::ControlPlaneError, + >, >, { async fn create_index( @@ -1739,6 +2321,30 @@ where ) -> crate::control_plane::ControlPlaneResult { self.clone().call(request).await } + async fn swap_indexing_pipelines( + &self, + request: SwapIndexingPipelinesRequest, + ) -> crate::control_plane::ControlPlaneResult { + self.clone().call(request).await + } + async fn enable_maintenance_mode( + &self, + 
request: EnableMaintenanceModeRequest, + ) -> crate::control_plane::ControlPlaneResult { + self.clone().call(request).await + } + async fn disable_maintenance_mode( + &self, + request: DisableMaintenanceModeRequest, + ) -> crate::control_plane::ControlPlaneResult { + self.clone().call(request).await + } + async fn get_maintenance_mode( + &self, + request: GetMaintenanceModeRequest, + ) -> crate::control_plane::ControlPlaneResult { + self.clone().call(request).await + } } #[derive(Debug, Clone)] pub struct ControlPlaneServiceGrpcClientAdapter { @@ -1918,6 +2524,62 @@ where super::metastore::PruneShardsRequest::rpc_name(), )) } + async fn swap_indexing_pipelines( + &self, + request: SwapIndexingPipelinesRequest, + ) -> crate::control_plane::ControlPlaneResult { + self.inner + .clone() + .swap_indexing_pipelines(request) + .await + .map(|response| response.into_inner()) + .map_err(|status| crate::error::grpc_status_to_service_error( + status, + SwapIndexingPipelinesRequest::rpc_name(), + )) + } + async fn enable_maintenance_mode( + &self, + request: EnableMaintenanceModeRequest, + ) -> crate::control_plane::ControlPlaneResult { + self.inner + .clone() + .enable_maintenance_mode(request) + .await + .map(|response| response.into_inner()) + .map_err(|status| crate::error::grpc_status_to_service_error( + status, + EnableMaintenanceModeRequest::rpc_name(), + )) + } + async fn disable_maintenance_mode( + &self, + request: DisableMaintenanceModeRequest, + ) -> crate::control_plane::ControlPlaneResult { + self.inner + .clone() + .disable_maintenance_mode(request) + .await + .map(|response| response.into_inner()) + .map_err(|status| crate::error::grpc_status_to_service_error( + status, + DisableMaintenanceModeRequest::rpc_name(), + )) + } + async fn get_maintenance_mode( + &self, + request: GetMaintenanceModeRequest, + ) -> crate::control_plane::ControlPlaneResult { + self.inner + .clone() + .get_maintenance_mode(request) + .await + .map(|response| response.into_inner()) + 
.map_err(|status| crate::error::grpc_status_to_service_error( + status, + GetMaintenanceModeRequest::rpc_name(), + )) + } } #[derive(Debug)] pub struct ControlPlaneServiceGrpcServerAdapter { @@ -2049,6 +2711,50 @@ for ControlPlaneServiceGrpcServerAdapter { .map(tonic::Response::new) .map_err(crate::error::grpc_error_to_grpc_status) } + async fn swap_indexing_pipelines( + &self, + request: tonic::Request, + ) -> Result, tonic::Status> { + self.inner + .0 + .swap_indexing_pipelines(request.into_inner()) + .await + .map(tonic::Response::new) + .map_err(crate::error::grpc_error_to_grpc_status) + } + async fn enable_maintenance_mode( + &self, + request: tonic::Request, + ) -> Result, tonic::Status> { + self.inner + .0 + .enable_maintenance_mode(request.into_inner()) + .await + .map(tonic::Response::new) + .map_err(crate::error::grpc_error_to_grpc_status) + } + async fn disable_maintenance_mode( + &self, + request: tonic::Request, + ) -> Result, tonic::Status> { + self.inner + .0 + .disable_maintenance_mode(request.into_inner()) + .await + .map(tonic::Response::new) + .map_err(crate::error::grpc_error_to_grpc_status) + } + async fn get_maintenance_mode( + &self, + request: tonic::Request, + ) -> Result, tonic::Status> { + self.inner + .0 + .get_maintenance_mode(request.into_inner()) + .await + .map(tonic::Response::new) + .map_err(crate::error::grpc_error_to_grpc_status) + } } /// Generated client implementations. pub mod control_plane_service_grpc_client { @@ -2450,6 +3156,127 @@ pub mod control_plane_service_grpc_client { ); self.inner.unary(req, path, codec).await } + /// Swaps indexing pipelines of different indexes between different indexers. 
+ pub async fn swap_indexing_pipelines( + &mut self, + request: impl tonic::IntoRequest, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + > { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::unknown( + format!("Service was not ready: {}", e.into()), + ) + })?; + let codec = tonic_prost::ProstCodec::default(); + let path = http::uri::PathAndQuery::from_static( + "/quickwit.control_plane.ControlPlaneService/SwapIndexingPipelines", + ); + let mut req = request.into_request(); + req.extensions_mut() + .insert( + GrpcMethod::new( + "quickwit.control_plane.ControlPlaneService", + "SwapIndexingPipelines", + ), + ); + self.inner.unary(req, path, codec).await + } + /// Enables maintenance mode on the cluster. When active, the indexing plan is frozen, + /// metadata mutations (index/source CRUD) are accepted but the plan is not rebuilt, and shard scaling/rebalancing is paused. + pub async fn enable_maintenance_mode( + &mut self, + request: impl tonic::IntoRequest, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + > { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::unknown( + format!("Service was not ready: {}", e.into()), + ) + })?; + let codec = tonic_prost::ProstCodec::default(); + let path = http::uri::PathAndQuery::from_static( + "/quickwit.control_plane.ControlPlaneService/EnableMaintenanceMode", + ); + let mut req = request.into_request(); + req.extensions_mut() + .insert( + GrpcMethod::new( + "quickwit.control_plane.ControlPlaneService", + "EnableMaintenanceMode", + ), + ); + self.inner.unary(req, path, codec).await + } + /// Disables maintenance mode. Triggers a full indexing plan rebuild to reconcile the cluster. 
+ pub async fn disable_maintenance_mode( + &mut self, + request: impl tonic::IntoRequest, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + > { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::unknown( + format!("Service was not ready: {}", e.into()), + ) + })?; + let codec = tonic_prost::ProstCodec::default(); + let path = http::uri::PathAndQuery::from_static( + "/quickwit.control_plane.ControlPlaneService/DisableMaintenanceMode", + ); + let mut req = request.into_request(); + req.extensions_mut() + .insert( + GrpcMethod::new( + "quickwit.control_plane.ControlPlaneService", + "DisableMaintenanceMode", + ), + ); + self.inner.unary(req, path, codec).await + } + /// Returns the current maintenance mode status. + pub async fn get_maintenance_mode( + &mut self, + request: impl tonic::IntoRequest, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + > { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::unknown( + format!("Service was not ready: {}", e.into()), + ) + })?; + let codec = tonic_prost::ProstCodec::default(); + let path = http::uri::PathAndQuery::from_static( + "/quickwit.control_plane.ControlPlaneService/GetMaintenanceMode", + ); + let mut req = request.into_request(); + req.extensions_mut() + .insert( + GrpcMethod::new( + "quickwit.control_plane.ControlPlaneService", + "GetMaintenanceMode", + ), + ); + self.inner.unary(req, path, codec).await + } } } /// Generated server implementations. @@ -2546,6 +3373,39 @@ pub mod control_plane_service_grpc_server { tonic::Response, tonic::Status, >; + /// Swaps indexing pipelines of different indexes between different indexers. + async fn swap_indexing_pipelines( + &self, + request: tonic::Request, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + >; + /// Enables maintenance mode on the cluster. 
When active, the indexing plan is frozen, + /// metadata mutations (index/source CRUD) are accepted but the plan is not rebuilt, and shard scaling/rebalancing is paused. + async fn enable_maintenance_mode( + &self, + request: tonic::Request, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + >; + /// Disables maintenance mode. Triggers a full indexing plan rebuild to reconcile the cluster. + async fn disable_maintenance_mode( + &self, + request: tonic::Request, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + >; + /// Returns the current maintenance mode status. + async fn get_maintenance_mode( + &self, + request: tonic::Request, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + >; } #[derive(Debug)] pub struct ControlPlaneServiceGrpcServer { @@ -3137,6 +3997,208 @@ pub mod control_plane_service_grpc_server { }; Box::pin(fut) } + "/quickwit.control_plane.ControlPlaneService/SwapIndexingPipelines" => { + #[allow(non_camel_case_types)] + struct SwapIndexingPipelinesSvc( + pub Arc, + ); + impl< + T: ControlPlaneServiceGrpc, + > tonic::server::UnaryService + for SwapIndexingPipelinesSvc { + type Response = super::SwapIndexingPipelinesResponse; + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; + fn call( + &mut self, + request: tonic::Request, + ) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = async move { + ::swap_indexing_pipelines( + &inner, + request, + ) + .await + }; + Box::pin(fut) + } + } + let accept_compression_encodings = self.accept_compression_encodings; + let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = self.inner.clone(); + let fut = async move { + let method = SwapIndexingPipelinesSvc(inner); + let codec = tonic_prost::ProstCodec::default(); + let mut grpc = tonic::server::Grpc::new(codec) + 
.apply_compression_config( + accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = grpc.unary(method, req).await; + Ok(res) + }; + Box::pin(fut) + } + "/quickwit.control_plane.ControlPlaneService/EnableMaintenanceMode" => { + #[allow(non_camel_case_types)] + struct EnableMaintenanceModeSvc( + pub Arc, + ); + impl< + T: ControlPlaneServiceGrpc, + > tonic::server::UnaryService + for EnableMaintenanceModeSvc { + type Response = super::EnableMaintenanceModeResponse; + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; + fn call( + &mut self, + request: tonic::Request, + ) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = async move { + ::enable_maintenance_mode( + &inner, + request, + ) + .await + }; + Box::pin(fut) + } + } + let accept_compression_encodings = self.accept_compression_encodings; + let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = self.inner.clone(); + let fut = async move { + let method = EnableMaintenanceModeSvc(inner); + let codec = tonic_prost::ProstCodec::default(); + let mut grpc = tonic::server::Grpc::new(codec) + .apply_compression_config( + accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = grpc.unary(method, req).await; + Ok(res) + }; + Box::pin(fut) + } + "/quickwit.control_plane.ControlPlaneService/DisableMaintenanceMode" => { + #[allow(non_camel_case_types)] + struct DisableMaintenanceModeSvc( + pub Arc, + ); + impl< + T: ControlPlaneServiceGrpc, + > tonic::server::UnaryService + for DisableMaintenanceModeSvc { + type Response = super::DisableMaintenanceModeResponse; + type Future = BoxFuture< + tonic::Response, + 
tonic::Status, + >; + fn call( + &mut self, + request: tonic::Request, + ) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = async move { + ::disable_maintenance_mode( + &inner, + request, + ) + .await + }; + Box::pin(fut) + } + } + let accept_compression_encodings = self.accept_compression_encodings; + let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = self.inner.clone(); + let fut = async move { + let method = DisableMaintenanceModeSvc(inner); + let codec = tonic_prost::ProstCodec::default(); + let mut grpc = tonic::server::Grpc::new(codec) + .apply_compression_config( + accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = grpc.unary(method, req).await; + Ok(res) + }; + Box::pin(fut) + } + "/quickwit.control_plane.ControlPlaneService/GetMaintenanceMode" => { + #[allow(non_camel_case_types)] + struct GetMaintenanceModeSvc(pub Arc); + impl< + T: ControlPlaneServiceGrpc, + > tonic::server::UnaryService + for GetMaintenanceModeSvc { + type Response = super::GetMaintenanceModeResponse; + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; + fn call( + &mut self, + request: tonic::Request, + ) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = async move { + ::get_maintenance_mode( + &inner, + request, + ) + .await + }; + Box::pin(fut) + } + } + let accept_compression_encodings = self.accept_compression_encodings; + let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = self.inner.clone(); + let fut = async move { + let method = GetMaintenanceModeSvc(inner); + let codec = 
tonic_prost::ProstCodec::default(); + let mut grpc = tonic::server::Grpc::new(codec) + .apply_compression_config( + accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = grpc.unary(method, req).await; + Ok(res) + }; + Box::pin(fut) + } _ => { Box::pin(async move { let mut response = http::Response::new( diff --git a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.metastore.rs b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.metastore.rs index ab6d1ddc236..267cc8aac31 100644 --- a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.metastore.rs +++ b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.metastore.rs @@ -210,6 +210,28 @@ pub struct DeleteSplitsRequest { } #[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] #[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] +pub struct SplitDocIds { + #[prost(string, tag = "1")] + pub split_id: ::prost::alloc::string::String, + #[prost(uint32, repeated, tag = "2")] + pub doc_ids: ::prost::alloc::vec::Vec, +} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct SoftDeleteDocumentsRequest { + #[prost(message, optional, tag = "1")] + pub index_uid: ::core::option::Option, + #[prost(message, repeated, tag = "2")] + pub split_doc_ids: ::prost::alloc::vec::Vec, +} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] +pub struct SoftDeleteDocumentsResponse { + #[prost(uint64, tag = "1")] + pub num_soft_deleted_doc_ids: u64, +} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct AddSourceRequest { #[prost(message, optional, tag = "1")] pub index_uid: ::core::option::Option, @@ -524,6 +546,33 @@ pub struct DeleteIndexTemplatesRequest { pub 
template_ids: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, } #[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] +pub struct GetKvRequest { + #[prost(string, tag = "1")] + pub key: ::prost::alloc::string::String, +} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] +pub struct GetKvResponse { + /// Empty if the key does not exist. + #[prost(string, optional, tag = "1")] + pub value: ::core::option::Option<::prost::alloc::string::String>, +} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] +pub struct SetKvRequest { + #[prost(string, tag = "1")] + pub key: ::prost::alloc::string::String, + #[prost(string, tag = "2")] + pub value: ::prost::alloc::string::String, +} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] +pub struct DeleteKvRequest { + #[prost(string, tag = "1")] + pub key: ::prost::alloc::string::String, +} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct GetClusterIdentityRequest {} #[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] @@ -693,6 +742,11 @@ impl RpcName for DeleteSplitsRequest { "delete_splits" } } +impl RpcName for SoftDeleteDocumentsRequest { + fn rpc_name() -> &'static str { + "soft_delete_documents" + } +} impl RpcName for AddSourceRequest { fn rpc_name() -> &'static str { "add_source" @@ -793,6 +847,21 @@ impl RpcName for DeleteIndexTemplatesRequest { "delete_index_templates" } } +impl RpcName for GetKvRequest { + fn rpc_name() -> &'static str { + "get_kv" + } +} +impl RpcName for SetKvRequest { + fn rpc_name() -> &'static str { + "set_kv" + } +} +impl RpcName for DeleteKvRequest { + fn rpc_name() -> &'static str { + "delete_kv" + } 
+} impl RpcName for GetClusterIdentityRequest { fn rpc_name() -> &'static str { "get_cluster_identity" @@ -867,6 +936,11 @@ pub trait MetastoreService: std::fmt::Debug + Send + Sync + 'static { &self, request: DeleteSplitsRequest, ) -> crate::metastore::MetastoreResult; + ///Soft-deletes individual documents within published splits. + async fn soft_delete_documents( + &self, + request: SoftDeleteDocumentsRequest, + ) -> crate::metastore::MetastoreResult; ///Adds a source. async fn add_source( &self, @@ -980,6 +1054,21 @@ pub trait MetastoreService: std::fmt::Debug + Send + Sync + 'static { &self, request: DeleteIndexTemplatesRequest, ) -> crate::metastore::MetastoreResult; + ///Gets a value by key from the cluster-wide key-value store. + async fn get_kv( + &self, + request: GetKvRequest, + ) -> crate::metastore::MetastoreResult; + ///Sets a key-value pair in the cluster-wide key-value store. + async fn set_kv( + &self, + request: SetKvRequest, + ) -> crate::metastore::MetastoreResult; + ///Deletes a key from the cluster-wide key-value store. 
+ async fn delete_kv( + &self, + request: DeleteKvRequest, + ) -> crate::metastore::MetastoreResult; ///Get cluster identity async fn get_cluster_identity( &self, @@ -1167,6 +1256,12 @@ impl MetastoreService for MetastoreServiceClient { ) -> crate::metastore::MetastoreResult { self.inner.0.delete_splits(request).await } + async fn soft_delete_documents( + &self, + request: SoftDeleteDocumentsRequest, + ) -> crate::metastore::MetastoreResult { + self.inner.0.soft_delete_documents(request).await + } async fn add_source( &self, request: AddSourceRequest, @@ -1287,6 +1382,24 @@ impl MetastoreService for MetastoreServiceClient { ) -> crate::metastore::MetastoreResult { self.inner.0.delete_index_templates(request).await } + async fn get_kv( + &self, + request: GetKvRequest, + ) -> crate::metastore::MetastoreResult { + self.inner.0.get_kv(request).await + } + async fn set_kv( + &self, + request: SetKvRequest, + ) -> crate::metastore::MetastoreResult { + self.inner.0.set_kv(request).await + } + async fn delete_kv( + &self, + request: DeleteKvRequest, + ) -> crate::metastore::MetastoreResult { + self.inner.0.delete_kv(request).await + } async fn get_cluster_identity( &self, request: GetClusterIdentityRequest, @@ -1383,6 +1496,12 @@ pub mod mock_metastore_service { ) -> crate::metastore::MetastoreResult { self.inner.lock().await.delete_splits(request).await } + async fn soft_delete_documents( + &self, + request: super::SoftDeleteDocumentsRequest, + ) -> crate::metastore::MetastoreResult { + self.inner.lock().await.soft_delete_documents(request).await + } async fn add_source( &self, request: super::AddSourceRequest, @@ -1505,6 +1624,24 @@ pub mod mock_metastore_service { ) -> crate::metastore::MetastoreResult { self.inner.lock().await.delete_index_templates(request).await } + async fn get_kv( + &self, + request: super::GetKvRequest, + ) -> crate::metastore::MetastoreResult { + self.inner.lock().await.get_kv(request).await + } + async fn set_kv( + &self, + request: 
super::SetKvRequest, + ) -> crate::metastore::MetastoreResult { + self.inner.lock().await.set_kv(request).await + } + async fn delete_kv( + &self, + request: super::DeleteKvRequest, + ) -> crate::metastore::MetastoreResult { + self.inner.lock().await.delete_kv(request).await + } async fn get_cluster_identity( &self, request: super::GetClusterIdentityRequest, @@ -1714,6 +1851,22 @@ impl tower::Service for InnerMetastoreServiceClient { Box::pin(fut) } } +impl tower::Service for InnerMetastoreServiceClient { + type Response = SoftDeleteDocumentsResponse; + type Error = crate::metastore::MetastoreError; + type Future = BoxFuture; + fn poll_ready( + &mut self, + _cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + std::task::Poll::Ready(Ok(())) + } + fn call(&mut self, request: SoftDeleteDocumentsRequest) -> Self::Future { + let svc = self.clone(); + let fut = async move { svc.0.soft_delete_documents(request).await }; + Box::pin(fut) + } +} impl tower::Service for InnerMetastoreServiceClient { type Response = EmptyResponse; type Error = crate::metastore::MetastoreError; @@ -2034,6 +2187,54 @@ impl tower::Service for InnerMetastoreServiceClient Box::pin(fut) } } +impl tower::Service for InnerMetastoreServiceClient { + type Response = GetKvResponse; + type Error = crate::metastore::MetastoreError; + type Future = BoxFuture; + fn poll_ready( + &mut self, + _cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + std::task::Poll::Ready(Ok(())) + } + fn call(&mut self, request: GetKvRequest) -> Self::Future { + let svc = self.clone(); + let fut = async move { svc.0.get_kv(request).await }; + Box::pin(fut) + } +} +impl tower::Service for InnerMetastoreServiceClient { + type Response = EmptyResponse; + type Error = crate::metastore::MetastoreError; + type Future = BoxFuture; + fn poll_ready( + &mut self, + _cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + std::task::Poll::Ready(Ok(())) + } + fn call(&mut self, request: SetKvRequest) -> Self::Future { 
+ let svc = self.clone(); + let fut = async move { svc.0.set_kv(request).await }; + Box::pin(fut) + } +} +impl tower::Service for InnerMetastoreServiceClient { + type Response = EmptyResponse; + type Error = crate::metastore::MetastoreError; + type Future = BoxFuture; + fn poll_ready( + &mut self, + _cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + std::task::Poll::Ready(Ok(())) + } + fn call(&mut self, request: DeleteKvRequest) -> Self::Future { + let svc = self.clone(); + let fut = async move { svc.0.delete_kv(request).await }; + Box::pin(fut) + } +} impl tower::Service for InnerMetastoreServiceClient { type Response = GetClusterIdentityResponse; type Error = crate::metastore::MetastoreError; @@ -2115,6 +2316,11 @@ struct MetastoreServiceTowerServiceStack { EmptyResponse, crate::metastore::MetastoreError, >, + soft_delete_documents_svc: quickwit_common::tower::BoxService< + SoftDeleteDocumentsRequest, + SoftDeleteDocumentsResponse, + crate::metastore::MetastoreError, + >, add_source_svc: quickwit_common::tower::BoxService< AddSourceRequest, EmptyResponse, @@ -2215,6 +2421,21 @@ struct MetastoreServiceTowerServiceStack { EmptyResponse, crate::metastore::MetastoreError, >, + get_kv_svc: quickwit_common::tower::BoxService< + GetKvRequest, + GetKvResponse, + crate::metastore::MetastoreError, + >, + set_kv_svc: quickwit_common::tower::BoxService< + SetKvRequest, + EmptyResponse, + crate::metastore::MetastoreError, + >, + delete_kv_svc: quickwit_common::tower::BoxService< + DeleteKvRequest, + EmptyResponse, + crate::metastore::MetastoreError, + >, get_cluster_identity_svc: quickwit_common::tower::BoxService< GetClusterIdentityRequest, GetClusterIdentityResponse, @@ -2295,6 +2516,12 @@ impl MetastoreService for MetastoreServiceTowerServiceStack { ) -> crate::metastore::MetastoreResult { self.delete_splits_svc.clone().ready().await?.call(request).await } + async fn soft_delete_documents( + &self, + request: SoftDeleteDocumentsRequest, + ) -> 
crate::metastore::MetastoreResult { + self.soft_delete_documents_svc.clone().ready().await?.call(request).await + } async fn add_source( &self, request: AddSourceRequest, @@ -2415,6 +2642,24 @@ impl MetastoreService for MetastoreServiceTowerServiceStack { ) -> crate::metastore::MetastoreResult { self.delete_index_templates_svc.clone().ready().await?.call(request).await } + async fn get_kv( + &self, + request: GetKvRequest, + ) -> crate::metastore::MetastoreResult { + self.get_kv_svc.clone().ready().await?.call(request).await + } + async fn set_kv( + &self, + request: SetKvRequest, + ) -> crate::metastore::MetastoreResult { + self.set_kv_svc.clone().ready().await?.call(request).await + } + async fn delete_kv( + &self, + request: DeleteKvRequest, + ) -> crate::metastore::MetastoreResult { + self.delete_kv_svc.clone().ready().await?.call(request).await + } async fn get_cluster_identity( &self, request: GetClusterIdentityRequest, @@ -2548,6 +2793,16 @@ type DeleteSplitsLayer = quickwit_common::tower::BoxLayer< EmptyResponse, crate::metastore::MetastoreError, >; +type SoftDeleteDocumentsLayer = quickwit_common::tower::BoxLayer< + quickwit_common::tower::BoxService< + SoftDeleteDocumentsRequest, + SoftDeleteDocumentsResponse, + crate::metastore::MetastoreError, + >, + SoftDeleteDocumentsRequest, + SoftDeleteDocumentsResponse, + crate::metastore::MetastoreError, +>; type AddSourceLayer = quickwit_common::tower::BoxLayer< quickwit_common::tower::BoxService< AddSourceRequest, @@ -2748,6 +3003,36 @@ type DeleteIndexTemplatesLayer = quickwit_common::tower::BoxLayer< EmptyResponse, crate::metastore::MetastoreError, >; +type GetKvLayer = quickwit_common::tower::BoxLayer< + quickwit_common::tower::BoxService< + GetKvRequest, + GetKvResponse, + crate::metastore::MetastoreError, + >, + GetKvRequest, + GetKvResponse, + crate::metastore::MetastoreError, +>; +type SetKvLayer = quickwit_common::tower::BoxLayer< + quickwit_common::tower::BoxService< + SetKvRequest, + EmptyResponse, + 
crate::metastore::MetastoreError, + >, + SetKvRequest, + EmptyResponse, + crate::metastore::MetastoreError, +>; +type DeleteKvLayer = quickwit_common::tower::BoxLayer< + quickwit_common::tower::BoxService< + DeleteKvRequest, + EmptyResponse, + crate::metastore::MetastoreError, + >, + DeleteKvRequest, + EmptyResponse, + crate::metastore::MetastoreError, +>; type GetClusterIdentityLayer = quickwit_common::tower::BoxLayer< quickwit_common::tower::BoxService< GetClusterIdentityRequest, @@ -2772,6 +3057,7 @@ pub struct MetastoreServiceTowerLayerStack { publish_splits_layers: Vec, mark_splits_for_deletion_layers: Vec, delete_splits_layers: Vec, + soft_delete_documents_layers: Vec, add_source_layers: Vec, update_source_layers: Vec, toggle_source_layers: Vec, @@ -2792,6 +3078,9 @@ pub struct MetastoreServiceTowerLayerStack { find_index_template_matches_layers: Vec, list_index_templates_layers: Vec, delete_index_templates_layers: Vec, + get_kv_layers: Vec, + set_kv_layers: Vec, + delete_kv_layers: Vec, get_cluster_identity_layers: Vec, } impl MetastoreServiceTowerLayerStack { @@ -3101,6 +3390,33 @@ impl MetastoreServiceTowerLayerStack { crate::metastore::MetastoreError, >, >>::Service as tower::Service>::Future: Send + 'static, + L: tower::Layer< + quickwit_common::tower::BoxService< + SoftDeleteDocumentsRequest, + SoftDeleteDocumentsResponse, + crate::metastore::MetastoreError, + >, + > + Clone + Send + Sync + 'static, + , + >>::Service: tower::Service< + SoftDeleteDocumentsRequest, + Response = SoftDeleteDocumentsResponse, + Error = crate::metastore::MetastoreError, + > + Clone + Send + Sync + 'static, + <, + >>::Service as tower::Service< + SoftDeleteDocumentsRequest, + >>::Future: Send + 'static, L: tower::Layer< quickwit_common::tower::BoxService< AddSourceRequest, @@ -3613,6 +3929,81 @@ impl MetastoreServiceTowerLayerStack { >>::Service as tower::Service< DeleteIndexTemplatesRequest, >>::Future: Send + 'static, + L: tower::Layer< + quickwit_common::tower::BoxService< 
+ GetKvRequest, + GetKvResponse, + crate::metastore::MetastoreError, + >, + > + Clone + Send + Sync + 'static, + , + >>::Service: tower::Service< + GetKvRequest, + Response = GetKvResponse, + Error = crate::metastore::MetastoreError, + > + Clone + Send + Sync + 'static, + <, + >>::Service as tower::Service>::Future: Send + 'static, + L: tower::Layer< + quickwit_common::tower::BoxService< + SetKvRequest, + EmptyResponse, + crate::metastore::MetastoreError, + >, + > + Clone + Send + Sync + 'static, + , + >>::Service: tower::Service< + SetKvRequest, + Response = EmptyResponse, + Error = crate::metastore::MetastoreError, + > + Clone + Send + Sync + 'static, + <, + >>::Service as tower::Service>::Future: Send + 'static, + L: tower::Layer< + quickwit_common::tower::BoxService< + DeleteKvRequest, + EmptyResponse, + crate::metastore::MetastoreError, + >, + > + Clone + Send + Sync + 'static, + , + >>::Service: tower::Service< + DeleteKvRequest, + Response = EmptyResponse, + Error = crate::metastore::MetastoreError, + > + Clone + Send + Sync + 'static, + <, + >>::Service as tower::Service>::Future: Send + 'static, L: tower::Layer< quickwit_common::tower::BoxService< GetClusterIdentityRequest, @@ -3665,6 +4056,8 @@ impl MetastoreServiceTowerLayerStack { .push(quickwit_common::tower::BoxLayer::new(layer.clone())); self.delete_splits_layers .push(quickwit_common::tower::BoxLayer::new(layer.clone())); + self.soft_delete_documents_layers + .push(quickwit_common::tower::BoxLayer::new(layer.clone())); self.add_source_layers .push(quickwit_common::tower::BoxLayer::new(layer.clone())); self.update_source_layers @@ -3705,6 +4098,9 @@ impl MetastoreServiceTowerLayerStack { .push(quickwit_common::tower::BoxLayer::new(layer.clone())); self.delete_index_templates_layers .push(quickwit_common::tower::BoxLayer::new(layer.clone())); + self.get_kv_layers.push(quickwit_common::tower::BoxLayer::new(layer.clone())); + self.set_kv_layers.push(quickwit_common::tower::BoxLayer::new(layer.clone())); 
+ self.delete_kv_layers.push(quickwit_common::tower::BoxLayer::new(layer.clone())); self.get_cluster_identity_layers .push(quickwit_common::tower::BoxLayer::new(layer.clone())); self @@ -3943,6 +4339,28 @@ impl MetastoreServiceTowerLayerStack { self.delete_splits_layers.push(quickwit_common::tower::BoxLayer::new(layer)); self } + pub fn stack_soft_delete_documents_layer(mut self, layer: L) -> Self + where + L: tower::Layer< + quickwit_common::tower::BoxService< + SoftDeleteDocumentsRequest, + SoftDeleteDocumentsResponse, + crate::metastore::MetastoreError, + >, + > + Send + Sync + 'static, + L::Service: tower::Service< + SoftDeleteDocumentsRequest, + Response = SoftDeleteDocumentsResponse, + Error = crate::metastore::MetastoreError, + > + Clone + Send + Sync + 'static, + >::Future: Send + 'static, + { + self.soft_delete_documents_layers + .push(quickwit_common::tower::BoxLayer::new(layer)); + self + } pub fn stack_add_source_layer(mut self, layer: L) -> Self where L: tower::Layer< @@ -4344,12 +4762,69 @@ impl MetastoreServiceTowerLayerStack { .push(quickwit_common::tower::BoxLayer::new(layer)); self } - pub fn stack_get_cluster_identity_layer(mut self, layer: L) -> Self + pub fn stack_get_kv_layer(mut self, layer: L) -> Self where L: tower::Layer< quickwit_common::tower::BoxService< - GetClusterIdentityRequest, - GetClusterIdentityResponse, + GetKvRequest, + GetKvResponse, + crate::metastore::MetastoreError, + >, + > + Send + Sync + 'static, + L::Service: tower::Service< + GetKvRequest, + Response = GetKvResponse, + Error = crate::metastore::MetastoreError, + > + Clone + Send + Sync + 'static, + >::Future: Send + 'static, + { + self.get_kv_layers.push(quickwit_common::tower::BoxLayer::new(layer)); + self + } + pub fn stack_set_kv_layer(mut self, layer: L) -> Self + where + L: tower::Layer< + quickwit_common::tower::BoxService< + SetKvRequest, + EmptyResponse, + crate::metastore::MetastoreError, + >, + > + Send + Sync + 'static, + L::Service: tower::Service< + 
SetKvRequest, + Response = EmptyResponse, + Error = crate::metastore::MetastoreError, + > + Clone + Send + Sync + 'static, + >::Future: Send + 'static, + { + self.set_kv_layers.push(quickwit_common::tower::BoxLayer::new(layer)); + self + } + pub fn stack_delete_kv_layer(mut self, layer: L) -> Self + where + L: tower::Layer< + quickwit_common::tower::BoxService< + DeleteKvRequest, + EmptyResponse, + crate::metastore::MetastoreError, + >, + > + Send + Sync + 'static, + L::Service: tower::Service< + DeleteKvRequest, + Response = EmptyResponse, + Error = crate::metastore::MetastoreError, + > + Clone + Send + Sync + 'static, + >::Future: Send + 'static, + { + self.delete_kv_layers.push(quickwit_common::tower::BoxLayer::new(layer)); + self + } + pub fn stack_get_cluster_identity_layer(mut self, layer: L) -> Self + where + L: tower::Layer< + quickwit_common::tower::BoxService< + GetClusterIdentityRequest, + GetClusterIdentityResponse, crate::metastore::MetastoreError, >, > + Send + Sync + 'static, @@ -4522,6 +4997,14 @@ impl MetastoreServiceTowerLayerStack { quickwit_common::tower::BoxService::new(inner_client.clone()), |svc, layer| layer.layer(svc), ); + let soft_delete_documents_svc = self + .soft_delete_documents_layers + .into_iter() + .rev() + .fold( + quickwit_common::tower::BoxService::new(inner_client.clone()), + |svc, layer| layer.layer(svc), + ); let add_source_svc = self .add_source_layers .into_iter() @@ -4682,6 +5165,30 @@ impl MetastoreServiceTowerLayerStack { quickwit_common::tower::BoxService::new(inner_client.clone()), |svc, layer| layer.layer(svc), ); + let get_kv_svc = self + .get_kv_layers + .into_iter() + .rev() + .fold( + quickwit_common::tower::BoxService::new(inner_client.clone()), + |svc, layer| layer.layer(svc), + ); + let set_kv_svc = self + .set_kv_layers + .into_iter() + .rev() + .fold( + quickwit_common::tower::BoxService::new(inner_client.clone()), + |svc, layer| layer.layer(svc), + ); + let delete_kv_svc = self + .delete_kv_layers + 
.into_iter() + .rev() + .fold( + quickwit_common::tower::BoxService::new(inner_client.clone()), + |svc, layer| layer.layer(svc), + ); let get_cluster_identity_svc = self .get_cluster_identity_layers .into_iter() @@ -4704,6 +5211,7 @@ impl MetastoreServiceTowerLayerStack { publish_splits_svc, mark_splits_for_deletion_svc, delete_splits_svc, + soft_delete_documents_svc, add_source_svc, update_source_svc, toggle_source_svc, @@ -4724,6 +5232,9 @@ impl MetastoreServiceTowerLayerStack { find_index_template_matches_svc, list_index_templates_svc, delete_index_templates_svc, + get_kv_svc, + set_kv_svc, + delete_kv_svc, get_cluster_identity_svc, }; MetastoreServiceClient::new(tower_svc_stack) @@ -4879,6 +5390,15 @@ where Error = crate::metastore::MetastoreError, Future = BoxFuture, > + + tower::Service< + SoftDeleteDocumentsRequest, + Response = SoftDeleteDocumentsResponse, + Error = crate::metastore::MetastoreError, + Future = BoxFuture< + SoftDeleteDocumentsResponse, + crate::metastore::MetastoreError, + >, + > + tower::Service< AddSourceRequest, Response = EmptyResponse, @@ -5014,6 +5534,24 @@ where Error = crate::metastore::MetastoreError, Future = BoxFuture, > + + tower::Service< + GetKvRequest, + Response = GetKvResponse, + Error = crate::metastore::MetastoreError, + Future = BoxFuture, + > + + tower::Service< + SetKvRequest, + Response = EmptyResponse, + Error = crate::metastore::MetastoreError, + Future = BoxFuture, + > + + tower::Service< + DeleteKvRequest, + Response = EmptyResponse, + Error = crate::metastore::MetastoreError, + Future = BoxFuture, + > + tower::Service< GetClusterIdentityRequest, Response = GetClusterIdentityResponse, @@ -5096,6 +5634,12 @@ where ) -> crate::metastore::MetastoreResult { self.clone().call(request).await } + async fn soft_delete_documents( + &self, + request: SoftDeleteDocumentsRequest, + ) -> crate::metastore::MetastoreResult { + self.clone().call(request).await + } async fn add_source( &self, request: AddSourceRequest, @@ -5216,6 
+5760,24 @@ where ) -> crate::metastore::MetastoreResult { self.clone().call(request).await } + async fn get_kv( + &self, + request: GetKvRequest, + ) -> crate::metastore::MetastoreResult { + self.clone().call(request).await + } + async fn set_kv( + &self, + request: SetKvRequest, + ) -> crate::metastore::MetastoreResult { + self.clone().call(request).await + } + async fn delete_kv( + &self, + request: DeleteKvRequest, + ) -> crate::metastore::MetastoreResult { + self.clone().call(request).await + } async fn get_cluster_identity( &self, request: GetClusterIdentityRequest, @@ -5445,6 +6007,20 @@ where DeleteSplitsRequest::rpc_name(), )) } + async fn soft_delete_documents( + &self, + request: SoftDeleteDocumentsRequest, + ) -> crate::metastore::MetastoreResult { + self.inner + .clone() + .soft_delete_documents(request) + .await + .map(|response| response.into_inner()) + .map_err(|status| crate::error::grpc_status_to_service_error( + status, + SoftDeleteDocumentsRequest::rpc_name(), + )) + } async fn add_source( &self, request: AddSourceRequest, @@ -5725,6 +6301,48 @@ where DeleteIndexTemplatesRequest::rpc_name(), )) } + async fn get_kv( + &self, + request: GetKvRequest, + ) -> crate::metastore::MetastoreResult { + self.inner + .clone() + .get_kv(request) + .await + .map(|response| response.into_inner()) + .map_err(|status| crate::error::grpc_status_to_service_error( + status, + GetKvRequest::rpc_name(), + )) + } + async fn set_kv( + &self, + request: SetKvRequest, + ) -> crate::metastore::MetastoreResult { + self.inner + .clone() + .set_kv(request) + .await + .map(|response| response.into_inner()) + .map_err(|status| crate::error::grpc_status_to_service_error( + status, + SetKvRequest::rpc_name(), + )) + } + async fn delete_kv( + &self, + request: DeleteKvRequest, + ) -> crate::metastore::MetastoreResult { + self.inner + .clone() + .delete_kv(request) + .await + .map(|response| response.into_inner()) + .map_err(|status| crate::error::grpc_status_to_service_error( + 
status, + DeleteKvRequest::rpc_name(), + )) + } async fn get_cluster_identity( &self, request: GetClusterIdentityRequest, @@ -5909,6 +6527,17 @@ for MetastoreServiceGrpcServerAdapter { .map(tonic::Response::new) .map_err(crate::error::grpc_error_to_grpc_status) } + async fn soft_delete_documents( + &self, + request: tonic::Request, + ) -> Result, tonic::Status> { + self.inner + .0 + .soft_delete_documents(request.into_inner()) + .await + .map(tonic::Response::new) + .map_err(crate::error::grpc_error_to_grpc_status) + } async fn add_source( &self, request: tonic::Request, @@ -6129,6 +6758,39 @@ for MetastoreServiceGrpcServerAdapter { .map(tonic::Response::new) .map_err(crate::error::grpc_error_to_grpc_status) } + async fn get_kv( + &self, + request: tonic::Request, + ) -> Result, tonic::Status> { + self.inner + .0 + .get_kv(request.into_inner()) + .await + .map(tonic::Response::new) + .map_err(crate::error::grpc_error_to_grpc_status) + } + async fn set_kv( + &self, + request: tonic::Request, + ) -> Result, tonic::Status> { + self.inner + .0 + .set_kv(request.into_inner()) + .await + .map(tonic::Response::new) + .map_err(crate::error::grpc_error_to_grpc_status) + } + async fn delete_kv( + &self, + request: tonic::Request, + ) -> Result, tonic::Status> { + self.inner + .0 + .delete_kv(request.into_inner()) + .await + .map(tonic::Response::new) + .map_err(crate::error::grpc_error_to_grpc_status) + } async fn get_cluster_identity( &self, request: tonic::Request, @@ -6619,6 +7281,36 @@ pub mod metastore_service_grpc_client { ); self.inner.unary(req, path, codec).await } + /// Soft-deletes individual documents within published splits. 
+ pub async fn soft_delete_documents( + &mut self, + request: impl tonic::IntoRequest, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + > { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::unknown( + format!("Service was not ready: {}", e.into()), + ) + })?; + let codec = tonic_prost::ProstCodec::default(); + let path = http::uri::PathAndQuery::from_static( + "/quickwit.metastore.MetastoreService/SoftDeleteDocuments", + ); + let mut req = request.into_request(); + req.extensions_mut() + .insert( + GrpcMethod::new( + "quickwit.metastore.MetastoreService", + "SoftDeleteDocuments", + ), + ); + self.inner.unary(req, path, codec).await + } /// Adds a source. pub async fn add_source( &mut self, @@ -7193,6 +7885,74 @@ pub mod metastore_service_grpc_client { ); self.inner.unary(req, path, codec).await } + /// Gets a value by key from the cluster-wide key-value store. + pub async fn get_kv( + &mut self, + request: impl tonic::IntoRequest, + ) -> std::result::Result, tonic::Status> { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::unknown( + format!("Service was not ready: {}", e.into()), + ) + })?; + let codec = tonic_prost::ProstCodec::default(); + let path = http::uri::PathAndQuery::from_static( + "/quickwit.metastore.MetastoreService/GetKv", + ); + let mut req = request.into_request(); + req.extensions_mut() + .insert(GrpcMethod::new("quickwit.metastore.MetastoreService", "GetKv")); + self.inner.unary(req, path, codec).await + } + /// Sets a key-value pair in the cluster-wide key-value store. 
+ pub async fn set_kv( + &mut self, + request: impl tonic::IntoRequest, + ) -> std::result::Result, tonic::Status> { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::unknown( + format!("Service was not ready: {}", e.into()), + ) + })?; + let codec = tonic_prost::ProstCodec::default(); + let path = http::uri::PathAndQuery::from_static( + "/quickwit.metastore.MetastoreService/SetKv", + ); + let mut req = request.into_request(); + req.extensions_mut() + .insert(GrpcMethod::new("quickwit.metastore.MetastoreService", "SetKv")); + self.inner.unary(req, path, codec).await + } + /// Deletes a key from the cluster-wide key-value store. + pub async fn delete_kv( + &mut self, + request: impl tonic::IntoRequest, + ) -> std::result::Result, tonic::Status> { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::unknown( + format!("Service was not ready: {}", e.into()), + ) + })?; + let codec = tonic_prost::ProstCodec::default(); + let path = http::uri::PathAndQuery::from_static( + "/quickwit.metastore.MetastoreService/DeleteKv", + ); + let mut req = request.into_request(); + req.extensions_mut() + .insert( + GrpcMethod::new("quickwit.metastore.MetastoreService", "DeleteKv"), + ); + self.inner.unary(req, path, codec).await + } /// Get cluster identity pub async fn get_cluster_identity( &mut self, @@ -7325,6 +8085,14 @@ pub mod metastore_service_grpc_server { &self, request: tonic::Request, ) -> std::result::Result, tonic::Status>; + /// Soft-deletes individual documents within published splits. + async fn soft_delete_documents( + &self, + request: tonic::Request, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + >; /// Adds a source. async fn add_source( &self, @@ -7471,6 +8239,21 @@ pub mod metastore_service_grpc_server { &self, request: tonic::Request, ) -> std::result::Result, tonic::Status>; + /// Gets a value by key from the cluster-wide key-value store. 
+ async fn get_kv( + &self, + request: tonic::Request, + ) -> std::result::Result, tonic::Status>; + /// Sets a key-value pair in the cluster-wide key-value store. + async fn set_kv( + &self, + request: tonic::Request, + ) -> std::result::Result, tonic::Status>; + /// Deletes a key from the cluster-wide key-value store. + async fn delete_kv( + &self, + request: tonic::Request, + ) -> std::result::Result, tonic::Status>; /// Get cluster identity async fn get_cluster_identity( &self, @@ -8176,6 +8959,55 @@ pub mod metastore_service_grpc_server { }; Box::pin(fut) } + "/quickwit.metastore.MetastoreService/SoftDeleteDocuments" => { + #[allow(non_camel_case_types)] + struct SoftDeleteDocumentsSvc(pub Arc); + impl< + T: MetastoreServiceGrpc, + > tonic::server::UnaryService + for SoftDeleteDocumentsSvc { + type Response = super::SoftDeleteDocumentsResponse; + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; + fn call( + &mut self, + request: tonic::Request, + ) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = async move { + ::soft_delete_documents( + &inner, + request, + ) + .await + }; + Box::pin(fut) + } + } + let accept_compression_encodings = self.accept_compression_encodings; + let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = self.inner.clone(); + let fut = async move { + let method = SoftDeleteDocumentsSvc(inner); + let codec = tonic_prost::ProstCodec::default(); + let mut grpc = tonic::server::Grpc::new(codec) + .apply_compression_config( + accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = grpc.unary(method, req).await; + Ok(res) + }; + Box::pin(fut) + } "/quickwit.metastore.MetastoreService/AddSource" => { #[allow(non_camel_case_types)] struct 
AddSourceSvc(pub Arc); @@ -9138,6 +9970,140 @@ pub mod metastore_service_grpc_server { }; Box::pin(fut) } + "/quickwit.metastore.MetastoreService/GetKv" => { + #[allow(non_camel_case_types)] + struct GetKvSvc(pub Arc); + impl< + T: MetastoreServiceGrpc, + > tonic::server::UnaryService for GetKvSvc { + type Response = super::GetKvResponse; + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; + fn call( + &mut self, + request: tonic::Request, + ) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = async move { + ::get_kv(&inner, request).await + }; + Box::pin(fut) + } + } + let accept_compression_encodings = self.accept_compression_encodings; + let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = self.inner.clone(); + let fut = async move { + let method = GetKvSvc(inner); + let codec = tonic_prost::ProstCodec::default(); + let mut grpc = tonic::server::Grpc::new(codec) + .apply_compression_config( + accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = grpc.unary(method, req).await; + Ok(res) + }; + Box::pin(fut) + } + "/quickwit.metastore.MetastoreService/SetKv" => { + #[allow(non_camel_case_types)] + struct SetKvSvc(pub Arc); + impl< + T: MetastoreServiceGrpc, + > tonic::server::UnaryService for SetKvSvc { + type Response = super::EmptyResponse; + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; + fn call( + &mut self, + request: tonic::Request, + ) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = async move { + ::set_kv(&inner, request).await + }; + Box::pin(fut) + } + } + let accept_compression_encodings = self.accept_compression_encodings; + let send_compression_encodings = self.send_compression_encodings; + let 
max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = self.inner.clone(); + let fut = async move { + let method = SetKvSvc(inner); + let codec = tonic_prost::ProstCodec::default(); + let mut grpc = tonic::server::Grpc::new(codec) + .apply_compression_config( + accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = grpc.unary(method, req).await; + Ok(res) + }; + Box::pin(fut) + } + "/quickwit.metastore.MetastoreService/DeleteKv" => { + #[allow(non_camel_case_types)] + struct DeleteKvSvc(pub Arc); + impl< + T: MetastoreServiceGrpc, + > tonic::server::UnaryService + for DeleteKvSvc { + type Response = super::EmptyResponse; + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; + fn call( + &mut self, + request: tonic::Request, + ) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = async move { + ::delete_kv(&inner, request) + .await + }; + Box::pin(fut) + } + } + let accept_compression_encodings = self.accept_compression_encodings; + let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = self.inner.clone(); + let fut = async move { + let method = DeleteKvSvc(inner); + let codec = tonic_prost::ProstCodec::default(); + let mut grpc = tonic::server::Grpc::new(codec) + .apply_compression_config( + accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = grpc.unary(method, req).await; + Ok(res) + }; + Box::pin(fut) + } "/quickwit.metastore.MetastoreService/GetClusterIdentity" => { #[allow(non_camel_case_types)] struct GetClusterIdentitySvc(pub Arc); diff --git 
a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.search.rs b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.search.rs index 9c6f7f5b70d..b2dfd344806 100644 --- a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.search.rs +++ b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.search.rs @@ -187,6 +187,9 @@ pub struct SearchRequest { pub ignore_missing_indexes: bool, #[prost(string, optional, tag = "19")] pub split_id: ::core::option::Option<::prost::alloc::string::String>, + /// The user agent of the client that initiated the search request. + #[prost(string, optional, tag = "20")] + pub user_agent: ::core::option::Option<::prost::alloc::string::String>, } #[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] #[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] @@ -197,7 +200,7 @@ pub struct SortField { pub sort_order: i32, /// Optional sort value format for datetime field only. /// If none, the default output format for datetime field is - /// unix_timestamp_nanos. + /// unix_timestamp_millis. #[prost(enumeration = "SortDatetimeFormat", optional, tag = "3")] pub sort_datetime_format: ::core::option::Option, } @@ -214,9 +217,6 @@ pub struct SearchResponse { /// server-side and expressed in microseconds. #[prost(uint64, tag = "3")] pub elapsed_time_micros: u64, - /// The searcherrors that occurred formatted as string. - #[prost(string, repeated, tag = "4")] - pub errors: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, /// Postcard-encoded aggregation response #[prost(bytes = "vec", optional, tag = "9")] pub aggregation_postcard: ::core::option::Option<::prost::alloc::vec::Vec>, @@ -327,6 +327,9 @@ pub struct SplitIdAndFooterOffsets { /// The number of docs in the split #[prost(uint64, tag = "6")] pub num_docs: u64, + /// Tantivy doc IDs that have been soft-deleted from this split + #[prost(uint32, repeated, tag = "7")] + pub soft_deleted_doc_ids: ::prost::alloc::vec::Vec, } /// Hits returned by a FetchDocRequest. 
/// @@ -407,16 +410,16 @@ pub struct PartialHit { } #[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] #[derive(Ord, PartialOrd)] -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, ::prost::Message)] pub struct SortByValue { - #[prost(oneof = "sort_by_value::SortValue", tags = "1, 2, 3, 4")] + #[prost(oneof = "sort_by_value::SortValue", tags = "1, 2, 3, 4, 5, 6")] pub sort_value: ::core::option::Option, } /// Nested message and enum types in `SortByValue`. pub mod sort_by_value { #[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] #[serde(rename_all = "snake_case")] - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] + #[derive(Clone, PartialEq, ::prost::Oneof)] pub enum SortValue { #[prost(uint64, tag = "1")] U64(u64), @@ -426,6 +429,10 @@ pub mod sort_by_value { F64(f64), #[prost(bool, tag = "4")] Boolean(bool), + #[prost(string, tag = "5")] + Str(::prost::alloc::string::String), + #[prost(int64, tag = "6")] + Datetime(i64), } } #[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] @@ -886,6 +893,35 @@ pub mod search_service_client { .insert(GrpcMethod::new("quickwit.search.SearchService", "FetchDocs")); self.inner.unary(req, path, codec).await } + /// Streams document contents from the document store. + /// This method takes `PartialHit`s and streams back `LeafHit`s in batches + /// to avoid hitting gRPC message size limits. 
+ pub async fn stream_fetch_docs( + &mut self, + request: impl tonic::IntoRequest, + ) -> std::result::Result< + tonic::Response>, + tonic::Status, + > { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::unknown( + format!("Service was not ready: {}", e.into()), + ) + })?; + let codec = tonic_prost::ProstCodec::default(); + let path = http::uri::PathAndQuery::from_static( + "/quickwit.search.SearchService/StreamFetchDocs", + ); + let mut req = request.into_request(); + req.extensions_mut() + .insert( + GrpcMethod::new("quickwit.search.SearchService", "StreamFetchDocs"), + ); + self.inner.server_streaming(req, path, codec).await + } /// Root list terms API. /// This RPC identifies the set of splits on which the query should run on, /// and dispatches the several calls to `LeafListTerms`. @@ -1167,6 +1203,22 @@ pub mod search_service_server { tonic::Response, tonic::Status, >; + /// Server streaming response type for the StreamFetchDocs method. + type StreamFetchDocsStream: tonic::codegen::tokio_stream::Stream< + Item = std::result::Result, + > + + std::marker::Send + + 'static; + /// Streams document contents from the document store. + /// This method takes `PartialHit`s and streams back `LeafHit`s in batches + /// to avoid hitting gRPC message size limits. + async fn stream_fetch_docs( + &self, + request: tonic::Request, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + >; /// Root list terms API. /// This RPC identifies the set of splits on which the query should run on, /// and dispatches the several calls to `LeafListTerms`. 
@@ -1451,6 +1503,53 @@ pub mod search_service_server { }; Box::pin(fut) } + "/quickwit.search.SearchService/StreamFetchDocs" => { + #[allow(non_camel_case_types)] + struct StreamFetchDocsSvc(pub Arc); + impl< + T: SearchService, + > tonic::server::ServerStreamingService + for StreamFetchDocsSvc { + type Response = super::FetchDocsResponse; + type ResponseStream = T::StreamFetchDocsStream; + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; + fn call( + &mut self, + request: tonic::Request, + ) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = async move { + ::stream_fetch_docs(&inner, request) + .await + }; + Box::pin(fut) + } + } + let accept_compression_encodings = self.accept_compression_encodings; + let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = self.inner.clone(); + let fut = async move { + let method = StreamFetchDocsSvc(inner); + let codec = tonic_prost::ProstCodec::default(); + let mut grpc = tonic::server::Grpc::new(codec) + .apply_compression_config( + accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = grpc.server_streaming(method, req).await; + Ok(res) + }; + Box::pin(fut) + } "/quickwit.search.SearchService/RootListTerms" => { #[allow(non_camel_case_types)] struct RootListTermsSvc(pub Arc); diff --git a/quickwit/quickwit-proto/src/control_plane/mod.rs b/quickwit/quickwit-proto/src/control_plane/mod.rs index 4278ec104eb..fd278bc6199 100644 --- a/quickwit/quickwit-proto/src/control_plane/mod.rs +++ b/quickwit/quickwit-proto/src/control_plane/mod.rs @@ -138,6 +138,12 @@ impl RpcName for AdviseResetShardsRequest { } } +impl RpcName for SwapIndexingPipelinesRequest { + fn rpc_name() -> &'static str { + "swap_indexing_pipelines" + } +} + impl 
GetOrCreateOpenShardsFailureReason { pub fn create_failure( &self, @@ -154,6 +160,24 @@ impl GetOrCreateOpenShardsFailureReason { } } +impl RpcName for EnableMaintenanceModeRequest { + fn rpc_name() -> &'static str { + "enable_maintenance_mode" + } +} + +impl RpcName for DisableMaintenanceModeRequest { + fn rpc_name() -> &'static str { + "disable_maintenance_mode" + } +} + +impl RpcName for GetMaintenanceModeRequest { + fn rpc_name() -> &'static str { + "get_maintenance_mode" + } +} + impl From for GetOrCreateOpenShardsSubrequest { fn from(metastore_open_shard_subrequest: OpenShardSubrequest) -> Self { let index_id = metastore_open_shard_subrequest.index_uid().index_id.clone(); diff --git a/quickwit/quickwit-proto/src/getters.rs b/quickwit/quickwit-proto/src/getters.rs index a327c1717a7..73847554b7c 100644 --- a/quickwit/quickwit-proto/src/getters.rs +++ b/quickwit/quickwit-proto/src/getters.rs @@ -136,6 +136,7 @@ generate_getters! { ToggleSourceRequest, UpdateIndexRequest, UpdateSourceRequest, + SoftDeleteDocumentsRequest, UpdateSplitsDeleteOpstampRequest } diff --git a/quickwit/quickwit-proto/src/lib.rs b/quickwit/quickwit-proto/src/lib.rs index f4ddb734d2a..f89fdb97687 100644 --- a/quickwit/quickwit-proto/src/lib.rs +++ b/quickwit/quickwit-proto/src/lib.rs @@ -28,7 +28,8 @@ use tracing_opentelemetry::OpenTelemetrySpanExt; pub mod cluster; pub mod control_plane; -pub use {bytes, tonic}; +pub use bytes; +pub use tonic; pub mod developer; pub mod error; mod getters; diff --git a/quickwit/quickwit-proto/src/search/mod.rs b/quickwit/quickwit-proto/src/search/mod.rs index 307de262a70..caba73828cd 100644 --- a/quickwit/quickwit-proto/src/search/mod.rs +++ b/quickwit/quickwit-proto/src/search/mod.rs @@ -17,6 +17,8 @@ use std::fmt; use std::io::{self, Read}; use prost::Message; +use quickwit_common::numeric_types::num_proj::ProjectedNumber; +use quickwit_common::numeric_types::{num_cmp, num_proj}; pub use sort_by_value::SortValue; 
include!("../codegen/quickwit/quickwit.search.rs"); @@ -83,6 +85,8 @@ impl SortByValue { } } Some(SortValue::Boolean(b)) => Bool(b), + Some(SortValue::Str(s)) => String(s), + Some(SortValue::Datetime(dt)) => Number(dt.into()), None => Null, } } @@ -104,18 +108,7 @@ impl SortByValue { return None; } } - // Strings that can be converted to a number are accepted. - // Some clients (like JS clients) can't easily handle large integers - // without losing precision, so we accept them as strings. - String(value) => { - if let Ok(number) = value.parse::() { - Some(SortValue::I64(number)) - } else if let Ok(number) = value.parse::() { - Some(SortValue::U64(number)) - } else { - return None; - } - } + String(value) => Some(SortValue::Str(value)), Array(_) | Object(_) => return None, }; Some(SortByValue { sort_value }) @@ -132,25 +125,33 @@ impl Eq for SortValue {} impl Ord for SortValue { #[inline] fn cmp(&self, other: &Self) -> Ordering { - // We make sure to end up with a total order. - match (*self, *other) { + match (self, other) { // Same types. - (SortValue::U64(left), SortValue::U64(right)) => left.cmp(&right), - (SortValue::I64(left), SortValue::I64(right)) => left.cmp(&right), - (SortValue::Boolean(left), SortValue::Boolean(right)) => left.cmp(&right), - // We half the logic by making sure we keep - // the "stronger" type on the left. + (SortValue::U64(left), SortValue::U64(right)) => left.cmp(right), + (SortValue::I64(left), SortValue::I64(right)) => left.cmp(right), + (SortValue::Boolean(left), SortValue::Boolean(right)) => left.cmp(right), + (SortValue::Str(left), SortValue::Str(right)) => left.cmp(right), + (SortValue::F64(left), SortValue::F64(right)) => left.total_cmp(right), + (SortValue::Datetime(left), SortValue::Datetime(right)) => left.cmp(right), + // Different numeric types but can still be compared. 
+ (SortValue::U64(left), SortValue::F64(right)) => { + num_cmp::cmp_u64_f64(*left, *right).expect("unexpected float comparison") + } + (SortValue::F64(left), SortValue::U64(right)) => num_cmp::cmp_u64_f64(*right, *left) + .expect("unexpected float comparison") + .reverse(), + (SortValue::I64(left), SortValue::F64(right)) => { + num_cmp::cmp_i64_f64(*left, *right).expect("unexpected float comparison") + } + (SortValue::F64(left), SortValue::I64(right)) => num_cmp::cmp_i64_f64(*right, *left) + .expect("unexpected float comparison") + .reverse(), + (SortValue::I64(left), SortValue::U64(right)) => num_cmp::cmp_i64_u64(*left, *right), (SortValue::U64(left), SortValue::I64(right)) => { - if left > i64::MAX as u64 { - return Ordering::Greater; - } - (left as i64).cmp(&right) + num_cmp::cmp_i64_u64(*right, *left).reverse() } - (SortValue::F64(left), SortValue::F64(right)) => left.total_cmp(&right), - (SortValue::F64(left), SortValue::U64(right)) => left.total_cmp(&(right as f64)), - (SortValue::F64(left), SortValue::I64(right)) => left.total_cmp(&(right as f64)), - (SortValue::Boolean(left), right) => SortValue::U64(left as u64).cmp(&right), - (left, right) => right.cmp(&left).reverse(), + // Incompatible types, they are sorted one after another. + (left, right) => left.type_sort_key().cmp(&right.type_sort_key()), } } } @@ -165,7 +166,7 @@ impl std::hash::Hash for SortValue { fn hash(&self, state: &mut H) { let this = self.normalize(); std::mem::discriminant(&this).hash(state); - match this { + match &this { SortValue::U64(number) => { number.hash(state); } @@ -178,6 +179,12 @@ impl std::hash::Hash for SortValue { SortValue::Boolean(b) => { b.hash(state); } + SortValue::Str(s) => { + s.hash(state); + } + SortValue::Datetime(dt) => { + dt.hash(state); + } } } } @@ -188,27 +195,36 @@ impl SortValue { /// For number, we prefer to represent them, in order, as i64, then as u64 and finally as f64. 
pub fn normalize(&self) -> Self { match self { - SortValue::I64(_) => *self, - SortValue::Boolean(_) => *self, - SortValue::U64(number) => { - if let Ok(number) = (*number).try_into() { - SortValue::I64(number) - } else { - *self - } - } - SortValue::F64(number) => { - let number = *number; - if number.ceil() == number { - // number is not NaN, and is a natural number - if number >= i64::MIN as f64 && number <= i64::MAX as f64 { - return SortValue::I64(number as i64); - } else if number.is_sign_positive() && number <= u64::MAX as f64 { - return SortValue::U64(number as u64); + SortValue::I64(_) => self.clone(), + SortValue::Boolean(_) => self.clone(), + SortValue::Str(_) => self.clone(), + SortValue::U64(number) => match num_proj::u64_to_i64(*number) { + ProjectedNumber::Exact(number) => SortValue::I64(number), + _ => self.clone(), + }, + SortValue::F64(float) => match num_proj::f64_to_i64(*float) { + ProjectedNumber::Exact(number) => SortValue::I64(number), + ProjectedNumber::AfterLast => { + if let ProjectedNumber::Exact(number) = num_proj::f64_to_u64(*float) { + SortValue::U64(number) + } else { + self.clone() } } - *self - } + _ => self.clone(), + }, + SortValue::Datetime(_) => self.clone(), + } + } + + pub fn type_sort_key(&self) -> TypeSortKey { + match self { + SortValue::U64(_) => TypeSortKey::Numeric, + SortValue::I64(_) => TypeSortKey::Numeric, + SortValue::F64(_) => TypeSortKey::Numeric, + SortValue::Boolean(_) => TypeSortKey::Boolean, + SortValue::Str(_) => TypeSortKey::Str, + SortValue::Datetime(_) => TypeSortKey::DateTime, } } } @@ -216,14 +232,26 @@ impl SortValue { impl PartialHit { /// Helper to get access to the 1st sort value pub fn sort_value(&self) -> Option { - if let Some(sort_value) = self.sort_value { - sort_value.sort_value + if let Some(sort_value) = &self.sort_value { + sort_value.sort_value.clone() } else { None } } } +/// Defines the order between types when sorting on a field with multiple types. 
+/// Expected order: +/// - Asc: numeric -> string -> boolean -> datetime +/// - Desc: datetime -> boolean -> string -> numeric +#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)] +pub enum TypeSortKey { + Numeric, + Str, + Boolean, + DateTime, +} + /// Serializes the Split fields. /// /// `fields_metadata` has to be sorted. diff --git a/quickwit/quickwit-query/Cargo.toml b/quickwit/quickwit-query/Cargo.toml index 066c00c0ff7..f24d8662715 100644 --- a/quickwit/quickwit-query/Cargo.toml +++ b/quickwit/quickwit-query/Cargo.toml @@ -15,9 +15,6 @@ anyhow = { workspace = true } base64 = { workspace = true } bitpacking = { workspace = true } hex = { workspace = true } -lindera-core = { workspace = true, optional = true } -lindera-dictionary = { workspace = true, optional = true } -lindera-tokenizer = { workspace = true, optional = true } once_cell = { workspace = true } regex = { workspace = true } serde = { workspace = true } @@ -29,7 +26,6 @@ tracing = { workspace = true } time = { workspace = true } thiserror = { workspace = true } rustc-hash = { workspace = true } -whichlang = { workspace = true, optional = true } quickwit-common = { workspace = true } quickwit-datetime = { workspace = true } @@ -42,19 +38,6 @@ time = { workspace = true } quickwit-common = { workspace = true, features = ["testsuite"] } -[features] -multilang = [ - "lindera-core", - "lindera-dictionary", - "lindera-tokenizer", - "whichlang", - "tantivy/stemmer", -] - [[bench]] name = "tokenizers_bench" harness = false - -[[bench]] -name = "multilang_tokenizers_bench" -harness = false diff --git a/quickwit/quickwit-query/benches/multilang_tokenizers_bench.rs b/quickwit/quickwit-query/benches/multilang_tokenizers_bench.rs deleted file mode 100644 index 61755dea556..00000000000 --- a/quickwit/quickwit-query/benches/multilang_tokenizers_bench.rs +++ /dev/null @@ -1,167 +0,0 @@ -// Copyright 2021-Present Datadog, Inc. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use criterion::{Criterion, Throughput, black_box, criterion_group, criterion_main}; -use quickwit_query::create_default_quickwit_tokenizer_manager; -use tantivy::tokenizer::{TextAnalyzer, Token, TokenStream}; - -// A random ascii string of length 100 chars. -const ASCII_SHORT: &str = "It is a long established fact"; -static ASCII_LONG: &str = r#"It is a long established fact that a reader will be distracted by the readable content of a - page when looking at its layout. The point of using Lorem Ipsum is that it has a - more-or-less normal distribution of letters, as opposed to using 'Content here, content - here', making it look like readable English. Many desktop publishing packages and web page - editors now use Lorem Ipsum as their default model text, and a search for 'lorem ipsum' will - uncover many web sites still in their infancy. 
Various versions have evolved over the years, - sometimes by accident, sometimes on purpose (injected humour and the like)."#; -const JPN_SHORT: &str = "日本ごです。 とても素敵な言葉ですね"; -const JPN_LONG: &str = r#"日本ごです。 和名の由来は、 - 太陽の動きにつれてその方向を追うように花が回るといわれたことから。 - ただしこの動きは生長に伴うものであるため、 - 実際に太陽を追って動くのは生長が盛んな若い時期だけである。 - 若いヒマワリの茎の上部の葉は太陽に正対になるように動き、 - 朝には東を向いていたのが夕方には西を向く。日没後はまもなく起きあがり、 - 夜明け前にはふたたび東に向く。この運動はつぼみを付ける頃まで続くが、 - つぼみが大きくなり花が開く素敵な言葉ですね."#; -const CMN_SHORT: &str = "滚滚长江东逝水,浪花淘尽英雄。"; -const CMN_LONG: &str = r#"滚滚长江东逝水,浪花淘尽英雄。是非成败转头空,青山依旧在,几度夕阳红。 - 白发渔樵江渚上,惯看秋月春风。一壶浊酒喜相逢,古今多少事,都付笑谈中。 - 是非成败转头空,青山依旧在,惯看秋月春风。一壶浊酒喜相逢,古今多少事, - 滚滚长江东逝水,浪花淘尽英雄。 几度夕阳红。白发渔樵江渚上,都付笑谈中。"#; -const KOR_SHORT: &str = "안녕하세요. 반갑습니다."; -const KOR_LONG: &str = r#" -포근히 내려오는 눈밭속에서는 -낯이 붉은 處女아이들도 깃들이어 오는 소리… -울고 -웃고 -수구리고 -새파라니 얼어서 -運命들이 모두다 안끼어 드는 소리… -큰놈에겐 큰 눈물자국, 작은놈에겐 작은 웃음 흔적 -큰이얘기 작은이얘기들이 오부록이 도란 그리며 안끼어 오는 소리 -끊임없이 내리는 눈발 속에서는 -山도 山도 靑山도 안끼어 드는 소리 -"#; - -fn process_tokens(analyzer: &mut TextAnalyzer, text: &str) -> Vec { - let mut token_stream = analyzer.token_stream(text); - let mut tokens: Vec = Vec::new(); - token_stream.process(&mut |token: &Token| tokens.push(token.clone())); - tokens -} - -pub fn tokenizers_throughput_benchmark(c: &mut Criterion) { - let mut group = c.benchmark_group("multilang"); - let tokenizer_manager = create_default_quickwit_tokenizer_manager(); - let mut default_tokenizer = tokenizer_manager.get_tokenizer("default").unwrap(); - let mut multilang_tokenizer = tokenizer_manager.get_tokenizer("multilang").unwrap(); - let mut chinese_tokenizer = tokenizer_manager - .get_tokenizer("chinese_compatible") - .unwrap(); - - group - .throughput(Throughput::Bytes(ASCII_SHORT.len() as u64)) - .bench_with_input("default-tokenize-short", ASCII_SHORT, |b, text| { - b.iter(|| process_tokens(&mut default_tokenizer, black_box(text))); - }); - group - .throughput(Throughput::Bytes(ASCII_LONG.len() as u64)) - .bench_with_input("default-tokenize-long", ASCII_LONG, |b, text| { - b.iter(|| 
process_tokens(&mut default_tokenizer, black_box(text))); - }); - group - .throughput(Throughput::Bytes(ASCII_SHORT.len() as u64)) - .bench_with_input("multilang-eng-tokenize-short", ASCII_SHORT, |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }); - group - .throughput(Throughput::Bytes(ASCII_LONG.len() as u64)) - .bench_with_input("multilang-eng-tokenize-long", ASCII_LONG, |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }); - let short_with_prefix = "ENG:".to_string() + ASCII_SHORT; - group - .throughput(Throughput::Bytes(ASCII_SHORT.len() as u64)) - .bench_with_input( - "multilang-tokenize-short-with-prefix", - &short_with_prefix, - |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }, - ); - let long_with_prefix = "ENG:".to_string() + ASCII_LONG; - group - .throughput(Throughput::Bytes(ASCII_LONG.len() as u64)) - .bench_with_input( - "multilang-tokenize-long-with-prefix", - &long_with_prefix, - |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }, - ); - group - .throughput(Throughput::Bytes(JPN_SHORT.len() as u64)) - .bench_with_input("multilang-tokenize-jpn-short", JPN_SHORT, |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }); - group - .throughput(Throughput::Bytes(JPN_LONG.len() as u64)) - .bench_with_input("multilang-tokenize-jpn-long", JPN_LONG, |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }); - group - .throughput(Throughput::Bytes(CMN_SHORT.len() as u64)) - .bench_with_input("multilang-tokenize-cmn-short", CMN_SHORT, |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }); - group - .throughput(Throughput::Bytes(CMN_LONG.len() as u64)) - .bench_with_input("multilang-tokenize-cmn-long", CMN_LONG, |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }); - group - 
.throughput(Throughput::Bytes(KOR_SHORT.len() as u64)) - .bench_with_input("multilang-tokenize-kor-short", KOR_SHORT, |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }); - group - .throughput(Throughput::Bytes(KOR_LONG.len() as u64)) - .bench_with_input("multilang-tokenize-kor-long", KOR_LONG, |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }); - group - .throughput(Throughput::Bytes(CMN_SHORT.len() as u64)) - .bench_with_input( - "chinese-compatible-tokenize-cmn-short", - CMN_SHORT, - |b, text| { - b.iter(|| process_tokens(&mut chinese_tokenizer, black_box(text))); - }, - ); - group - .throughput(Throughput::Bytes(CMN_LONG.len() as u64)) - .bench_with_input( - "chinese-compatible-tokenize-cmn-long", - CMN_LONG, - |b, text| { - b.iter(|| process_tokens(&mut chinese_tokenizer, black_box(text))); - }, - ); -} - -criterion_group!( - tokenizers_throughput_benches, - tokenizers_throughput_benchmark -); -criterion_main!(tokenizers_throughput_benches); diff --git a/quickwit/quickwit-query/src/lib.rs b/quickwit/quickwit-query/src/lib.rs index b2040f73daa..8f70e155933 100644 --- a/quickwit/quickwit-query/src/lib.rs +++ b/quickwit/quickwit-query/src/lib.rs @@ -38,8 +38,6 @@ pub(crate) use not_nan_f32::NotNaNf32; pub use query_ast::utils::find_field_or_hit_dynamic; use serde::{Deserialize, Serialize}; pub use tantivy::query::Query as TantivyQuery; -#[cfg(feature = "multilang")] -pub use tokenizers::MultiLangTokenizer; pub use tokenizers::{ CodeTokenizer, DEFAULT_REMOVE_TOKEN_LENGTH, create_default_quickwit_tokenizer_manager, get_quickwit_fastfield_normalizer_manager, diff --git a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs index 84176f4a4aa..7b24a66163d 100644 --- a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs +++ b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs @@ -247,7 +247,6 @@ mod tests { "raw_lowercase", 
"lowercase", "default", - "en_stem", "chinese_compatible", "source_code_default", "source_code_with_hex", @@ -290,7 +289,6 @@ mod tests { "raw_lowercase", "lowercase", "default", - "en_stem", "chinese_compatible", "source_code_default", "source_code_with_hex", @@ -335,7 +333,6 @@ mod tests { "raw_lowercase", "lowercase", "default", - "en_stem", "chinese_compatible", "source_code_default", "source_code_with_hex", @@ -398,7 +395,6 @@ mod tests { "raw_lowercase", "lowercase", "default", - "en_stem", "chinese_compatible", "source_code_default", "source_code_with_hex", diff --git a/quickwit/quickwit-query/src/tokenizers/mod.rs b/quickwit/quickwit-query/src/tokenizers/mod.rs index d086c36a977..5a90715075e 100644 --- a/quickwit/quickwit-query/src/tokenizers/mod.rs +++ b/quickwit/quickwit-query/src/tokenizers/mod.rs @@ -14,8 +14,6 @@ mod chinese_compatible; mod code_tokenizer; -#[cfg(feature = "multilang")] -mod multilang; mod tokenizer_manager; use once_cell::sync::Lazy; @@ -26,8 +24,6 @@ use tantivy::tokenizer::{ use self::chinese_compatible::ChineseTokenizer; pub use self::code_tokenizer::CodeTokenizer; -#[cfg(feature = "multilang")] -pub use self::multilang::MultiLangTokenizer; pub use self::tokenizer_manager::{RAW_TOKENIZER_NAME, TokenizerManager}; pub const DEFAULT_REMOVE_TOKEN_LENGTH: usize = 255; @@ -58,17 +54,6 @@ pub fn create_default_quickwit_tokenizer_manager() -> TokenizerManager { .filter(LowerCaser) .build(); tokenizer_manager.register("default", default_tokenizer, true); - #[cfg(feature = "multilang")] - { - let en_stem_tokenizer = TextAnalyzer::builder(SimpleTokenizer::default()) - .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH)) - .filter(LowerCaser) - .filter(tantivy::tokenizer::Stemmer::new( - tantivy::tokenizer::Language::English, - )) - .build(); - tokenizer_manager.register("en_stem", en_stem_tokenizer, true); - } tokenizer_manager.register("whitespace", WhitespaceTokenizer::default(), false); let chinese_tokenizer = 
TextAnalyzer::builder(ChineseTokenizer) @@ -94,15 +79,6 @@ pub fn create_default_quickwit_tokenizer_manager() -> TokenizerManager { .build(), true, ); - #[cfg(feature = "multilang")] - tokenizer_manager.register( - "multilang_default", - TextAnalyzer::builder(MultiLangTokenizer::default()) - .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH)) - .filter(LowerCaser) - .build(), - true, - ); tokenizer_manager } diff --git a/quickwit/quickwit-query/src/tokenizers/multilang.rs b/quickwit/quickwit-query/src/tokenizers/multilang.rs deleted file mode 100644 index a62d2ff151c..00000000000 --- a/quickwit/quickwit-query/src/tokenizers/multilang.rs +++ /dev/null @@ -1,334 +0,0 @@ -// Copyright 2021-Present Datadog, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use lindera_core::mode::Mode; -use lindera_dictionary::{DictionaryConfig, DictionaryKind, load_dictionary_from_config}; -use lindera_tokenizer::token::Token as LinderaToken; -use lindera_tokenizer::tokenizer::Tokenizer as LinderaTokenizer; -use once_cell::sync::Lazy; -use tantivy::tokenizer::{SimpleTokenStream, SimpleTokenizer, Token, TokenStream, Tokenizer}; -use whichlang::{Lang, detect_language}; - -// Note(fmassot): we use `lindera_tokenizer::tokenizer::Tokenizer` and not -// `use lindera_tantivy::tokenizer::LinderaTokenizer` to avoid -// costly copy of lindera dictionaries each time we clone the `MultiLangTokenizer`. - -/// Mandarin chinese tokenizer. 
-static CMN_TOKENIZER: Lazy = Lazy::new(|| { - let cmn_dictionary_config = DictionaryConfig { - kind: Some(DictionaryKind::CcCedict), - path: None, - }; - let cmn_dictionary = load_dictionary_from_config(cmn_dictionary_config) - .expect("Lindera `CcCedict` dictionary must be present"); - LinderaTokenizer::new(cmn_dictionary, None, Mode::Normal) -}); - -/// Japanese tokenizer. -static JPN_TOKENIZER: Lazy = Lazy::new(|| { - let jpn_dictionary_config = DictionaryConfig { - kind: Some(DictionaryKind::IPADIC), - path: None, - }; - let jpn_dictionary = load_dictionary_from_config(jpn_dictionary_config) - .expect("Lindera `IPADIC` dictionary must be present"); - LinderaTokenizer::new(jpn_dictionary, None, Mode::Normal) -}); - -/// Korean tokenizer. -static KOR_TOKENIZER: Lazy = Lazy::new(|| { - let kor_dictionary_config = DictionaryConfig { - kind: Some(DictionaryKind::KoDic), - path: None, - }; - let kor_dictionary = load_dictionary_from_config(kor_dictionary_config) - .expect("Lindera `KoDic` dictionary must be present"); - LinderaTokenizer::new(kor_dictionary, None, Mode::Normal) -}); - -/// Multilanguage tokenizer that uses the `whichlang` to detect the language of the text -/// and uses the appropriate tokenizer for the detected language: -/// - lindera for Chinese, Japanese, and Korean. -/// - Quickwit's default tokenizer for other languages. -/// -/// It is possible to bypass the language detection by prefixing the text with the language code -/// followed by a colon. For example, `KOR:일본입니다` will be tokenized by the korean tokenizer. 
-/// Current supported prefix are: -/// - `KOR:` for Korean tokenizer -/// - `JPN:` for Japanese tokenizer -/// - `CMN:` for Chinese tokenizer -/// - `ENG:` for Quickwit's default tokenizer -#[derive(Clone, Default)] -pub struct MultiLangTokenizer { - default_tokenizer: SimpleTokenizer, - token: Token, -} - -impl Tokenizer for MultiLangTokenizer { - type TokenStream<'a> = MultiLanguageTokenStream<'a>; - fn token_stream<'a>(&'a mut self, text: &'a str) -> MultiLanguageTokenStream<'a> { - self.token.reset(); - let (language_prefix, text_to_tokenize) = get_language_from_prefix(text); - // If the text is empty, we return an empty token stream. - // `whichlang::detect_language` panicks if the text is empty. - if text.trim().is_empty() { - return MultiLanguageTokenStream::Empty; - } - let language = language_prefix.unwrap_or_else(|| detect_language(text_to_tokenize)); - match language { - Lang::Cmn => { - let lindera_token_stream = LinderaTokenStream { - tokens: CMN_TOKENIZER - .tokenize(text_to_tokenize) - .expect("tokenize method should never fail"), - token: &mut self.token, - }; - MultiLanguageTokenStream::Lindera(lindera_token_stream) - } - Lang::Jpn => { - let lindera_token_stream = LinderaTokenStream { - tokens: JPN_TOKENIZER - .tokenize(text_to_tokenize) - .expect("tokenize method should never fail"), - token: &mut self.token, - }; - MultiLanguageTokenStream::Lindera(lindera_token_stream) - } - Lang::Kor => { - let lindera_token_stream = LinderaTokenStream { - tokens: KOR_TOKENIZER - .tokenize(text_to_tokenize) - .expect("tokenize method should never fail"), - token: &mut self.token, - }; - MultiLanguageTokenStream::Lindera(lindera_token_stream) - } - _ => MultiLanguageTokenStream::Simple( - self.default_tokenizer.token_stream(text_to_tokenize), - ), - } - } -} - -/// Gets the language defined by a prefix `{ID}:text` where ID being the 3-letter language used by -/// whichlang) and returns the language and the text without the prefix. 
If the prefix is not -/// recognized, the language is `None` and the text is the original. -fn get_language_from_prefix(text: &str) -> (Option, &str) { - let prefix_bytes = &text.as_bytes()[0..std::cmp::min(4, text.len())]; - // TODO: refactor. - let prefix_language = match prefix_bytes { - b"CMN:" => Some(Lang::Cmn), - b"ENG:" => Some(Lang::Eng), - b"JPN:" => Some(Lang::Jpn), - b"KOR:" => Some(Lang::Kor), - _ => None, - }; - let text_without_prefix = if prefix_language.is_some() { - // This is safe as we know that the prefix is made of 4 ascii characters. - &text[4..] - } else { - text - }; - (prefix_language, text_without_prefix) -} -pub enum MultiLanguageTokenStream<'a> { - Empty, - Lindera(LinderaTokenStream<'a>), - Simple(SimpleTokenStream<'a>), -} - -impl TokenStream for MultiLanguageTokenStream<'_> { - fn advance(&mut self) -> bool { - match self { - MultiLanguageTokenStream::Empty => false, - MultiLanguageTokenStream::Lindera(tokenizer) => tokenizer.advance(), - MultiLanguageTokenStream::Simple(tokenizer) => tokenizer.advance(), - } - } - - fn token(&self) -> &Token { - match self { - MultiLanguageTokenStream::Empty => { - panic!("Cannot call token() on an empty token stream.") - } - MultiLanguageTokenStream::Lindera(tokenizer) => tokenizer.token(), - MultiLanguageTokenStream::Simple(tokenizer) => tokenizer.token(), - } - } - - fn token_mut(&mut self) -> &mut Token { - match self { - MultiLanguageTokenStream::Empty => { - panic!("Cannot call token_mut() on an empty token stream.") - } - MultiLanguageTokenStream::Lindera(tokenizer) => tokenizer.token_mut(), - MultiLanguageTokenStream::Simple(tokenizer) => tokenizer.token_mut(), - } - } -} - -pub struct LinderaTokenStream<'a> { - pub tokens: Vec>, - pub token: &'a mut Token, -} - -impl TokenStream for LinderaTokenStream<'_> { - fn advance(&mut self) -> bool { - if self.tokens.is_empty() { - return false; - } - let token = self.tokens.remove(0); - self.token.text = token.text.to_string(); - 
self.token.offset_from = token.byte_start; - self.token.offset_to = token.byte_end; - self.token.position = token.position; - self.token.position_length = token.position_length; - - true - } - - fn token(&self) -> &Token { - self.token - } - - fn token_mut(&mut self) -> &mut Token { - self.token - } -} - -#[cfg(test)] -mod tests { - use tantivy::tokenizer::{Token, TokenStream, Tokenizer}; - - use super::{MultiLangTokenizer, MultiLanguageTokenStream, get_language_from_prefix}; - - fn test_helper(mut tokenizer: MultiLanguageTokenStream) -> Vec { - let mut tokens: Vec = Vec::new(); - tokenizer.process(&mut |token: &Token| tokens.push(token.clone())); - tokens - } - - #[test] - fn test_multilanguage_tokenizer_cmn() { - let mut tokenizer = MultiLangTokenizer::default(); - let tokens = test_helper( - tokenizer.token_stream("地址1,包含無效的字元 (包括符號與不標準的asci阿爾發字元"), - ); - assert_eq!(tokens.len(), 19); - { - let token = &tokens[0]; - assert_eq!(token.text, "地址"); - assert_eq!(token.offset_from, 0); - assert_eq!(token.offset_to, 6); - assert_eq!(token.position, 0); - assert_eq!(token.position_length, 1); - } - } - - #[test] - fn test_multilanguage_tokenizer_jpn() { - let mut tokenizer = MultiLangTokenizer::default(); - { - let tokens = test_helper(tokenizer.token_stream("すもももももももものうち")); - assert_eq!(tokens.len(), 7); - { - let token = &tokens[0]; - assert_eq!(token.text, "すもも"); - assert_eq!(token.offset_from, 0); - assert_eq!(token.offset_to, 9); - assert_eq!(token.position, 0); - assert_eq!(token.position_length, 1); - } - } - { - // Force usage of JPN tokenizer. - let tokens = test_helper(tokenizer.token_stream("JPN:すもももももももものうち")); - assert_eq!(tokens.len(), 7); - } - { - // Force usage of ENG tokenizer. - // This tokenizer will return only one token. 
- let tokens = test_helper(tokenizer.token_stream("ENG:すもももももももものうち")); - assert_eq!(tokens.len(), 1); - } - } - - #[test] - fn test_multilanguage_tokenizer_kor() { - let mut tokenizer = MultiLangTokenizer::default(); - { - let tokens = test_helper(tokenizer.token_stream("일본입니다. 매우 멋진 단어입니다.")); - assert_eq!(tokens.len(), 11); - { - let token = &tokens[0]; - assert_eq!(token.text, "일본"); - assert_eq!(token.offset_from, 0); - assert_eq!(token.offset_to, 6); - assert_eq!(token.position, 0); - assert_eq!(token.position_length, 1); - } - } - { - let tokens = - test_helper(tokenizer.token_stream("KOR:일본입니다. 매우 멋진 단어입니다.")); - assert_eq!(tokens.len(), 11); - } - { - let tokens = test_helper(tokenizer.token_stream("ENG:일본입니다")); - assert_eq!(tokens.len(), 1); - } - } - - #[test] - fn test_multilanguage_tokenizer_with_empty_string() { - let mut tokenizer = MultiLangTokenizer::default(); - { - let tokens = test_helper(tokenizer.token_stream("")); - assert_eq!(tokens.len(), 0); - } - { - let tokens = test_helper(tokenizer.token_stream(" ")); - assert_eq!(tokens.len(), 0); - } - } - - #[test] - fn test_multilanguage_process_language_prefix() { - { - let (lang, text) = get_language_from_prefix("JPN:すもももももももものうち"); - assert_eq!(lang, Some(whichlang::Lang::Jpn)); - assert_eq!(text, "すもももももももものうち"); - } - { - let (lang, text) = get_language_from_prefix("CMN:地址1,包含無效的字元"); - assert_eq!(lang, Some(whichlang::Lang::Cmn)); - assert_eq!(text, "地址1,包含無效的字元"); - } - { - let (lang, text) = get_language_from_prefix("ENG:my address"); - assert_eq!(lang, Some(whichlang::Lang::Eng)); - assert_eq!(text, "my address"); - } - { - let (lang, text) = get_language_from_prefix("UNK:my address"); - assert!(lang.is_none()); - assert_eq!(text, "UNK:my address"); - } - { - let (lang, text) = get_language_from_prefix(""); - assert!(lang.is_none()); - assert_eq!(text, ""); - } - } -} diff --git a/quickwit/quickwit-rest-client/src/models.rs b/quickwit/quickwit-rest-client/src/models.rs index 
2857495803f..cfe570058bd 100644 --- a/quickwit/quickwit-rest-client/src/models.rs +++ b/quickwit/quickwit-rest-client/src/models.rs @@ -83,7 +83,6 @@ pub struct SearchResponseRestClient { pub hits: Vec, pub snippets: Option>, pub elapsed_time_micros: u64, - pub errors: Vec, pub aggregations: Option, } diff --git a/quickwit/quickwit-rest-client/src/rest_client.rs b/quickwit/quickwit-rest-client/src/rest_client.rs index 1fb2b5c9812..ab51a3fd1dd 100644 --- a/quickwit/quickwit-rest-client/src/rest_client.rs +++ b/quickwit/quickwit-rest-client/src/rest_client.rs @@ -25,7 +25,7 @@ use quickwit_proto::ingest::Shard; use quickwit_serve::{ ListSplitsQueryParams, ListSplitsResponse, RestIngestResponse, SearchRequestQueryString, }; -use reqwest::header::{CONTENT_TYPE, HeaderMap, HeaderValue}; +use reqwest::header::{CONTENT_TYPE, HeaderMap, HeaderValue, USER_AGENT}; use reqwest::tls::Certificate; use reqwest::{ClientBuilder as ReqwestClientBuilder, Method, StatusCode, Url}; use reqwest_middleware::{ClientBuilder as ReqwestMiddlewareClientBuilder, ClientWithMiddleware}; @@ -112,6 +112,7 @@ impl Transport { } let mut request_headers = HeaderMap::new(); request_headers.insert(CONTENT_TYPE, HeaderValue::from_static(DEFAULT_CONTENT_TYPE)); + request_headers.insert(USER_AGENT, HeaderValue::from_static("qw-rest-client")); if let Some(header_map_val) = header_map { request_headers.extend(header_map_val.into_iter()); } @@ -292,6 +293,10 @@ impl QuickwitClient { ClusterClient::new(&self.transport, self.timeout) } + pub fn maintenance(&self) -> MaintenanceClient<'_> { + MaintenanceClient::new(&self.transport, self.timeout) + } + pub fn node_stats(&self) -> NodeStatsClient<'_> { NodeStatsClient::new(&self.transport, self.timeout) } @@ -780,6 +785,79 @@ impl<'a> NodeHealthClient<'a> { } } +/// Response from the maintenance status endpoint. 
+#[derive(Debug, serde::Deserialize)] +pub struct MaintenanceStatusResponse { + pub is_maintenance_mode: bool, + pub enabled_at: Option, +} + +/// Response from the enable maintenance endpoint. +#[derive(Debug, serde::Deserialize)] +pub struct EnableMaintenanceResponse { + pub frozen_plan_json: String, +} + +/// Client for maintenance mode APIs. +pub struct MaintenanceClient<'a> { + transport: &'a Transport, + timeout: Timeout, +} + +impl<'a> MaintenanceClient<'a> { + fn new(transport: &'a Transport, timeout: Timeout) -> Self { + Self { transport, timeout } + } + + pub async fn status(&self) -> Result { + let response = self + .transport + .send::<()>( + Method::GET, + "cluster/maintenance", + None, + None, + None, + self.timeout, + ) + .await?; + let status = response.deserialize().await?; + Ok(status) + } + + pub async fn enable(&self) -> Result { + let response = self + .transport + .send::<()>( + Method::PUT, + "cluster/maintenance", + None, + None, + None, + self.timeout, + ) + .await?; + let result = response.deserialize().await?; + Ok(result) + } + + pub async fn disable(&self) -> Result<(), Error> { + let response = self + .transport + .send::<()>( + Method::DELETE, + "cluster/maintenance", + None, + None, + None, + self.timeout, + ) + .await?; + response.check().await?; + Ok(()) + } +} + fn header_from_config_format(config_format: ConfigFormat) -> HeaderMap { let mut header_map = HeaderMap::new(); let content_type_value = format!("application/{}", config_format.as_str()); @@ -842,7 +920,6 @@ mod test { snippets: None, aggregations: None, elapsed_time_micros: 100, - errors: Vec::new(), }; Mock::given(method("POST")) .and(path("/api/v1/my-index/search")) diff --git a/quickwit/quickwit-search/src/client.rs b/quickwit/quickwit-search/src/client.rs index 194bf0b2bd0..628325b2efc 100644 --- a/quickwit/quickwit-search/src/client.rs +++ b/quickwit/quickwit-search/src/client.rs @@ -18,6 +18,7 @@ use std::sync::Arc; use std::time::Duration; use bytesize::ByteSize; 
+use futures::TryStreamExt; use http::Uri; use quickwit_proto::search::{GetKvRequest, PutKvRequest, ReportSplitsRequest}; use quickwit_proto::tonic::Request; @@ -151,12 +152,24 @@ impl SearchServiceClient { ) -> crate::Result { match &mut self.client_impl { SearchServiceClientImpl::Grpc(grpc_client) => { + let nb_docs_fetched = request.partial_hits.len(); let tonic_request = Request::new(request); - let tonic_response = grpc_client - .fetch_docs(tonic_request) + let all_hits = grpc_client + .stream_fetch_docs(tonic_request) .await - .map_err(|tonic_error| parse_grpc_error(&tonic_error))?; - Ok(tonic_response.into_inner()) + .map_err(|tonic_error| parse_grpc_error(&tonic_error))? + .into_inner() + // TODO stream item errors are all collapsed into SearchError::Internal + .map_err(|tonic_error| parse_grpc_error(&tonic_error)) + .try_fold( + Vec::with_capacity(nb_docs_fetched), + |mut acc, response| async move { + acc.extend(response.hits); + Ok(acc) + }, + ) + .await?; + Ok(quickwit_proto::search::FetchDocsResponse { hits: all_hits }) } SearchServiceClientImpl::Local(service) => service.fetch_docs(request).await, } diff --git a/quickwit/quickwit-search/src/cluster_client.rs b/quickwit/quickwit-search/src/cluster_client.rs index 79f6ba81702..31c98157889 100644 --- a/quickwit/quickwit-search/src/cluster_client.rs +++ b/quickwit/quickwit-search/src/cluster_client.rs @@ -328,6 +328,7 @@ mod tests { timestamp_start: None, timestamp_end: None, num_docs: 0, + soft_deleted_doc_ids: Vec::new(), }], ..Default::default() } @@ -355,6 +356,7 @@ mod tests { timestamp_start: None, timestamp_end: None, num_docs: 0, + soft_deleted_doc_ids: Vec::new(), }, SplitIdAndFooterOffsets { split_id: "split_2".to_string(), @@ -363,6 +365,7 @@ mod tests { timestamp_start: None, timestamp_end: None, num_docs: 0, + soft_deleted_doc_ids: Vec::new(), }, ], }], diff --git a/quickwit/quickwit-search/src/collector.rs b/quickwit/quickwit-search/src/collector.rs index ed21fd968ba..d901ed26071 100644 --- 
a/quickwit/quickwit-search/src/collector.rs +++ b/quickwit/quickwit-search/src/collector.rs @@ -16,12 +16,15 @@ use std::borrow::Cow; use std::cmp::Ordering; use std::collections::HashSet; -use itertools::Itertools; +use itertools::{Either, Itertools}; use quickwit_common::binary_heap::{SortKeyMapper, TopK}; +use quickwit_common::numeric_types::num_proj::{ + ProjectedNumber, f64_to_i64, f64_to_u64, i64_to_f64, i64_to_u64, u64_to_f64, u64_to_i64, +}; use quickwit_doc_mapper::{FastFieldWarmupInfo, WarmupInfo}; use quickwit_proto::search::{ LeafSearchResponse, PartialHit, ResourceStats, SearchRequest, SortByValue, SortOrder, - SortValue, SplitSearchError, + SortValue, SplitSearchError, TypeSortKey, }; use quickwit_proto::types::SplitId; use serde::Deserialize; @@ -29,13 +32,18 @@ use tantivy::aggregation::agg_req::{Aggregations, get_fast_field_names}; use tantivy::aggregation::intermediate_agg_result::IntermediateAggregationResults; use tantivy::aggregation::{AggContextParams, AggregationLimitsGuard, AggregationSegmentCollector}; use tantivy::collector::{Collector, SegmentCollector}; -use tantivy::columnar::{ColumnType, MonotonicallyMappableToU64}; +use tantivy::columnar::{ + ColumnIndex, ColumnType, MonotonicallyMappableToU64, StrColumn, TermOrdHit, +}; use tantivy::fastfield::Column; use tantivy::tokenizer::TokenizerManager; -use tantivy::{DateTime, DocId, Score, SegmentOrdinal, SegmentReader, TantivyError}; +use tantivy::{ + COLLECT_BLOCK_BUFFER_LEN, DocId, Score, SegmentOrdinal, SegmentReader, TantivyError, +}; use crate::find_trace_ids_collector::{FindTraceIdsCollector, FindTraceIdsSegmentCollector, Span}; -use crate::top_k_collector::{QuickwitSegmentTopKCollector, specialized_top_k_segment_collector}; +use crate::sort_repr::{ElidableU64, InternalSortValueRepr, InternalValueRepr}; +use crate::top_k_collector::QuickwitSegmentTopKCollector; use crate::{GlobalDocAddress, merge_resource_stats, merge_resource_stats_it}; #[derive(Clone, Debug)] @@ -51,30 +59,7 @@ 
pub(crate) enum SortByComponent { order: SortOrder, }, } -impl From for SortByPair { - fn from(value: SortByComponent) -> Self { - Self { - first: value, - second: None, - } - } -} -#[derive(Clone)] -pub(crate) struct SortByPair { - first: SortByComponent, - second: Option, -} -impl SortByPair { - pub fn sort_orders(&self) -> (SortOrder, SortOrder) { - ( - self.first.sort_order(), - self.second - .as_ref() - .map(|sort_by| sort_by.sort_order()) - .unwrap_or(SortOrder::Desc), - ) - } -} + impl SortByComponent { fn to_sorting_field_extractor_component( &self, @@ -83,19 +68,48 @@ impl SortByComponent { match self { SortByComponent::DocId { .. } => Ok(SortingFieldExtractorComponent::DocId), SortByComponent::FastField { field_name, .. } => { - let sort_column_opt: Option<(Column, ColumnType)> = - segment_reader.fast_fields().u64_lenient(field_name)?; - let (sort_column, column_type) = sort_column_opt.unwrap_or_else(|| { - ( - Column::build_empty_column(segment_reader.max_doc()), - ColumnType::U64, - ) - }); - let sort_field_type = SortFieldType::try_from(column_type)?; - Ok(SortingFieldExtractorComponent::FastField { - sort_column, - sort_field_type, - }) + let allowed_column_types = [ + ColumnType::I64, + ColumnType::U64, + ColumnType::F64, + ColumnType::Str, + ColumnType::DateTime, + ColumnType::Bool, + // ColumnType::IpAddr Unsupported + // ColumnType::Bytes Unsupported + ]; + let fast_fields = segment_reader.fast_fields(); + let mut sort_columns = fast_fields + .u64_lenient_for_type_all(Some(&allowed_column_types), field_name)? + .into_iter() + .map(|(col, col_typ)| match col_typ { + ColumnType::U64 => Ok((col, SortFieldType::U64)), + ColumnType::I64 => Ok((col, SortFieldType::I64)), + ColumnType::F64 => Ok((col, SortFieldType::F64)), + ColumnType::DateTime => Ok((col, SortFieldType::DateTime)), + ColumnType::Bool => Ok((col, SortFieldType::Bool)), + ColumnType::Str => Ok(( + col, + SortFieldType::String( + fast_fields + .str(field_name)? 
+ .expect("field with str column type should have str column"), + ), + )), + _ => panic!("unsupported"), + }) + .collect::>>()?; + + sort_columns.sort_by_key(|(_, col_typ)| col_typ.type_sort_key()); + + // TODO we could skip the columns that are before the search after + + Ok(SortingFieldExtractorComponent::FastField( + FastFieldExtractor { + sort_columns, + col_scratch: Box::new([None; COLLECT_BLOCK_BUFFER_LEN]), + }, + )) } SortByComponent::Score { .. } => Ok(SortingFieldExtractorComponent::Score), } @@ -125,347 +139,568 @@ impl SortByComponent { } } -#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[derive(Clone)] +pub(crate) struct SortByPair { + first: SortByComponent, + second: Option, +} +impl SortByPair { + pub fn sort_orders(&self) -> (SortOrder, SortOrder) { + ( + self.first.sort_order(), + self.second + .as_ref() + .map(|sort_by| sort_by.sort_order()) + .unwrap_or(SortOrder::Desc), + ) + } +} + +#[derive(Clone, Debug)] pub(crate) enum SortFieldType { U64, I64, F64, DateTime, Bool, + String(StrColumn), +} + +impl SortFieldType { + fn type_sort_key(&self) -> TypeSortKey { + match self { + SortFieldType::U64 => TypeSortKey::Numeric, + SortFieldType::I64 => TypeSortKey::Numeric, + SortFieldType::F64 => TypeSortKey::Numeric, + SortFieldType::DateTime => TypeSortKey::DateTime, + SortFieldType::Bool => TypeSortKey::Boolean, + SortFieldType::String(_) => TypeSortKey::Str, + } + } +} + +struct FastFieldExtractor { + /// Sort columns are sorted in the same order as types (TypeSortKey) + sort_columns: Vec<(Column, SortFieldType)>, + col_scratch: Box<[Option; COLLECT_BLOCK_BUFFER_LEN]>, +} + +impl FastFieldExtractor { + fn fill_batch( + &mut self, + docs: &[DocId], + order: SortOrder, + out: &mut [InternalValueRepr], + ) { + let n = docs.len(); + let unique_column = &self.sort_columns[0].0; + if let ColumnIndex::Multivalued(_) = unique_column.index { + // TODO: first_vals() doesn't enforce zeroing for multivalued + // columns. 
It seems like something that should be fixed in Tantivy? + self.col_scratch[..n].fill(None); + } + self.sort_columns[0] + .0 + .first_vals(docs, &mut self.col_scratch[..n]); + for (repr, val_opt) in out[..n].iter_mut().zip(self.col_scratch[..n].iter()) { + *repr = match val_opt { + Some(val) => InternalValueRepr::new(*val, 0, order), + None => InternalValueRepr::new_missing(), + }; + } + } } /// The `SortingFieldExtractor` is used to extract a score, which can either be a true score, /// a value from a fast field, or nothing (sort by DocId). -pub(crate) enum SortingFieldExtractorComponent { +enum SortingFieldExtractorComponent { /// If undefined, we simply sort by DocIds. DocId, - FastField { - sort_column: Column, - sort_field_type: SortFieldType, - }, + FastField(FastFieldExtractor), Score, } impl SortingFieldExtractorComponent { - pub fn is_score(&self) -> bool { - matches!(self, SortingFieldExtractorComponent::Score) + pub fn is_doc_id(&self) -> bool { + matches!(self, SortingFieldExtractorComponent::DocId) } - pub fn is_fast_field(&self) -> bool { - matches!(self, SortingFieldExtractorComponent::FastField { .. }) - } - /// Loads the fast field values for the given doc_ids in its u64 representation. The returned - /// u64 representation maintains the ordering of the original value. - #[inline] - pub fn extract_typed_sort_values_block(&self, doc_ids: &[DocId], values: &mut [Option]) { - // In the collect block case we don't have scores to extract - if let SortingFieldExtractorComponent::FastField { sort_column, .. } = self { - let values = &mut values[..doc_ids.len()]; - sort_column.first_vals(doc_ids, values); + + /// Currently batch extraction only has a fast path for full columns. That + /// can only happen if there is only one column for the fast field. 
+ fn extractor_for_batch_if_worthwhile(&mut self) -> Option<&mut FastFieldExtractor> { + match self { + SortingFieldExtractorComponent::FastField(extractor) + if extractor.sort_columns.len() == 1 => + { + Some(extractor) + } + _ => None, } } - /// Returns the sort value for the given element in its u64 representation. The returned u64 - /// representation maintains the ordering of the original value. - /// - /// The function returns None if the sort key is a fast field, for which we have no value - /// for the given doc_id, or we sort by DocId. + /// Returns the sort value for the given element in its u64 representation. + /// The returned u64 representation maintains the ordering of the original + /// value. #[inline] - fn extract_typed_sort_value_opt(&self, doc_id: DocId, score: Score) -> Option { + fn project_to_internal_sort_value( + &self, + doc_id: DocId, + score: Score, + order: SortOrder, + ) -> InternalValueRepr { match self { - // Tie breaks are not handled here, but in SegmentPartialHit - SortingFieldExtractorComponent::DocId => None, - SortingFieldExtractorComponent::FastField { sort_column, .. } => { - sort_column.first(doc_id) + SortingFieldExtractorComponent::DocId => { + // Doc id is handled at the compound sort value level + debug_assert!(V::is_elided()); + InternalValueRepr::new_missing() + } + SortingFieldExtractorComponent::FastField(FastFieldExtractor { + sort_columns, .. + }) => { + for (idx, (sort_column, _)) in sort_columns.iter().enumerate() { + if let Some(value) = sort_column.first(doc_id) { + return InternalValueRepr::new(value, idx as u8, order); + } + } + InternalValueRepr::new_missing() + } + SortingFieldExtractorComponent::Score => { + InternalValueRepr::new((score as f64).to_u64(), 0, order) } - SortingFieldExtractorComponent::Score => Some((score as f64).to_u64()), } } - #[inline] - /// Converts u64 fast field values to its correct type. - /// The conversion is delayed for performance reasons. 
- /// - /// This is used to convert `search_after` sort value to a u64 representation that will respect - /// the same order as the `SortValue` representation. - pub fn convert_u64_ff_val_to_sort_value(&self, sort_value: u64) -> SortValue { - let map_fast_field_to_value = |fast_field_value, field_type| match field_type { - SortFieldType::U64 => SortValue::U64(fast_field_value), - SortFieldType::I64 => SortValue::I64(i64::from_u64(fast_field_value)), - SortFieldType::F64 => SortValue::F64(f64::from_u64(fast_field_value)), - SortFieldType::DateTime => SortValue::I64(i64::from_u64(fast_field_value)), - SortFieldType::Bool => SortValue::Boolean(fast_field_value != 0u64), - }; - match self { - SortingFieldExtractorComponent::DocId => SortValue::U64(sort_value), - SortingFieldExtractorComponent::FastField { - sort_field_type, .. - } => map_fast_field_to_value(sort_value, *sort_field_type), - SortingFieldExtractorComponent::Score => SortValue::F64(f64::from_u64(sort_value)), + fn project_from_internal_sort_value( + &self, + internal_repr: InternalValueRepr, + order: SortOrder, + ) -> tantivy::Result> { + if V::is_elided() { + return Ok(None); } + let Some((col_idx, val_as_u64)) = internal_repr.decode(order) else { + return Ok(Some(SortByValue { sort_value: None })); + }; + let sort_value = match self { + SortingFieldExtractorComponent::FastField(FastFieldExtractor { + sort_columns, .. 
+ }) => { + let (_, field_type) = &sort_columns[col_idx as usize]; + match field_type { + SortFieldType::U64 => SortValue::U64(val_as_u64), + SortFieldType::I64 => SortValue::I64(i64::from_u64(val_as_u64)), + SortFieldType::F64 => SortValue::F64(f64::from_u64(val_as_u64)), + SortFieldType::DateTime => SortValue::Datetime(i64::from_u64(val_as_u64)), + SortFieldType::Bool => SortValue::Boolean(val_as_u64 != 0u64), + SortFieldType::String(str_column) => { + let term_dict = str_column.dictionary(); + let mut buffer = Vec::new(); + term_dict.ord_to_term(val_as_u64, &mut buffer)?; + let string_value = String::from_utf8(buffer).map_err(|_| { + tantivy::TantivyError::InternalError( + "term dictionary contains non-UTF-8 bytes".to_string(), + ) + })?; + SortValue::Str(string_value) + } + } + } + SortingFieldExtractorComponent::Score => SortValue::F64(f64::from_u64(val_as_u64)), + SortingFieldExtractorComponent::DocId => { + return Err(tantivy::TantivyError::InternalError( + "value should be elided on doc id sort".to_string(), + )); + } + }; + Ok(Some(SortByValue { + sort_value: Some(sort_value), + })) } - /// Converts fast field values into their u64 fast field representation. - /// - /// Returns None if value is out of bounds of target value. - /// None means that the search_after will be disabled and everything matches. - /// - /// What's currently missing is to signal that _nothing_ matches to generate an optimized - /// query. For now we just choose the max value of the target type. - #[inline] - pub fn convert_to_u64_ff_val( + + fn project_to_internal_search_after( &self, - sort_value: SortValue, + sort_by_value: &SortByValue, sort_order: SortOrder, - ) -> Option { - match self { - SortingFieldExtractorComponent::DocId => match sort_value { - SortValue::U64(val) => Some(val), - _ => panic!("Internal error: Got non-U64 sort value for DocId."), - }, - SortingFieldExtractorComponent::FastField { - sort_field_type, .. 
- } => { - // We need to convert a (potential user provided) value in the correct u64 - // representation of the fast field. - // This requires this weird conversion of first casting into the target type - // (if possible) and then to its u64 presentation. - // - // For the conversion into the target type it's important to know if the target - // type does not cover the whole range of the source type. In that case we need to - // add additional conversion checks, to see if it matches everything - // or nothing. (Which also depends on the sort order). - // Below are the visual representations of the value ranges of the different types. - // Note: DateTime is equal to I64 and omitted. - // - // Bool value range (0, 1): - // <-> - // - // I64 value range (signed 64-bit integer): - // <------------------------------------> - // -2^63 2^63-1 - // U64 value range (unsigned 64-bit integer): - // <------------------------------------> - // 0 2^64-1 - // F64 value range (64-bit floating point, conceptual, not to scale): - // <--------------------------------------------------------------------> - // Very negative numbers Very positive numbers - // - // Those conversions have limited target type value space: - // - [X] U64 -> I64 - // - [X] F64 -> I64 - // - [X] I64 -> U64 - // - [X] F64 -> U64 - // - // - [X] F64 -> Bool - // - [X] I64 -> Bool - // - [X] U64 -> Bool - // - let val = match (sort_value, sort_field_type) { - // Same field type, no conversion needed. - (SortValue::U64(val), SortFieldType::U64) => val, - (SortValue::F64(val), SortFieldType::F64) => val.to_u64(), - (SortValue::Boolean(val), SortFieldType::Bool) => val.to_u64(), - (SortValue::I64(val), SortFieldType::I64) => val.to_u64(), - (SortValue::U64(mut val), SortFieldType::I64) => { - if sort_order == SortOrder::Desc && val > i64::MAX as u64 { - return None; - } - // Add a limit to avoid overflow. 
- val = val.min(i64::MAX as u64); - (val as i64).to_u64() - } - (SortValue::U64(val), SortFieldType::F64) => (val as f64).to_u64(), - (SortValue::U64(mut val), SortFieldType::DateTime) => { - // Match everything - if sort_order == SortOrder::Desc && val > i64::MAX as u64 { - return None; - } - // Add a limit to avoid overflow. - val = val.min(i64::MAX as u64); - DateTime::from_timestamp_nanos(val as i64).to_u64() - } - (SortValue::I64(val), SortFieldType::U64) => { - if val < 0 && sort_order == SortOrder::Asc { - return None; - } - if val < 0 && sort_order == SortOrder::Desc { - u64::MIN // matches nothing as search_after is not inclusive - } else { - val as u64 - } - } - (SortValue::I64(val), SortFieldType::F64) => (val as f64).to_u64(), - (SortValue::I64(val), SortFieldType::DateTime) => { - DateTime::from_timestamp_nanos(val).to_u64() - } - (SortValue::F64(val), SortFieldType::U64) => { - let all_values_ahead1 = - val < u64::MIN as f64 && sort_order == SortOrder::Asc; - let all_values_ahead2 = - val > u64::MAX as f64 && sort_order == SortOrder::Desc; - if all_values_ahead1 || all_values_ahead2 { - return None; - } - // f64 cast already handles under/overflow and clamps the value - (val as u64).to_u64() - } - (SortValue::F64(val), SortFieldType::I64) - | (SortValue::F64(val), SortFieldType::DateTime) => { - let all_values_ahead1 = - val < i64::MIN as f64 && sort_order == SortOrder::Asc; - let all_values_ahead2 = - val > i64::MAX as f64 && sort_order == SortOrder::Desc; - if all_values_ahead1 || all_values_ahead2 { - return None; - } - // f64 cast already handles under/overflow and clamps the value - let val_i64 = val as i64; + ) -> tantivy::Result> { + let SortByValue { + sort_value: sort_value_opt, + } = sort_by_value; + match (self, sort_value_opt) { + (SortingFieldExtractorComponent::DocId, _) => { + // Doc id sorts are handled at the compound sort value level + debug_assert!(V::is_elided()); + Ok(InternalValueRepr::new_missing()) + } + 
(SortingFieldExtractorComponent::FastField(_), None) => { + Ok(InternalValueRepr::new_missing()) + } + ( + SortingFieldExtractorComponent::FastField(FastFieldExtractor { + sort_columns, .. + }), + Some(sort_value), + ) => project_search_after_sort_value(sort_columns, sort_value, sort_order), + (SortingFieldExtractorComponent::Score, Some(SortValue::F64(val))) => { + Ok(InternalValueRepr::new(val.to_u64(), 0, sort_order)) + } + (SortingFieldExtractorComponent::Score, _) => { + Err(tantivy::TantivyError::InvalidArgument( + "got non-F64 sort value for score".to_string(), + )) + } + } + } +} - if *sort_field_type == SortFieldType::DateTime { - DateTime::from_timestamp_nanos(val_i64).to_u64() - } else { - val_i64.to_u64() - } - } - // Not sure when we hit this, it's probably are very rare case. - (SortValue::Boolean(val), SortFieldType::U64) => val as u64, - (SortValue::Boolean(val), SortFieldType::F64) => (val as u64 as f64).to_u64(), - (SortValue::Boolean(val), SortFieldType::I64) => (val as i64).to_u64(), - (SortValue::Boolean(val), SortFieldType::DateTime) => { - DateTime::from_timestamp_nanos(val as i64).to_u64() +fn projected_number_internal_repr( + projected: ProjectedNumber, + order: SortOrder, + accessor_idx: u8, +) -> InternalValueRepr { + match (projected, order) { + (ProjectedNumber::Exact(val), _) => { + InternalValueRepr::new(val.to_u64(), accessor_idx, order) + } + (ProjectedNumber::AfterLast, SortOrder::Asc) => { + InternalValueRepr::new_skip_column(accessor_idx, order) + } + (ProjectedNumber::AfterLast, SortOrder::Desc) => { + InternalValueRepr::new_keep_column(accessor_idx, order) + } + (ProjectedNumber::Next(val), SortOrder::Asc) => { + let val_u64 = val.to_u64(); + if val_u64 == 0 { + InternalValueRepr::new_keep_column(accessor_idx, order) + } else { + InternalValueRepr::new(val_u64 - 1, accessor_idx, order) + } + } + (ProjectedNumber::Next(val), SortOrder::Desc) => { + let val_u64 = val.to_u64(); + if val_u64 == 0 { + 
InternalValueRepr::new_skip_column(accessor_idx, order) + } else { + InternalValueRepr::new(val_u64, accessor_idx, order) + } + } + } +} + +fn project_search_after_sort_value( + sort_columns: &[(Column, SortFieldType)], + sort_value: &SortValue, + sort_order: SortOrder, +) -> tantivy::Result> { + let col_iter = match sort_order { + SortOrder::Asc => Either::Left(sort_columns.iter().enumerate()), + SortOrder::Desc => Either::Right(sort_columns.iter().enumerate().rev()), + }; + for (idx, sort_column) in col_iter { + let internal_repr = match (&sort_column.1, sort_value) { + // project to u64 column + (SortFieldType::U64, SortValue::U64(val)) => { + InternalValueRepr::new(*val, idx as u8, sort_order) + } + (SortFieldType::U64, SortValue::F64(val)) => { + projected_number_internal_repr(f64_to_u64(*val), sort_order, idx as u8) + } + (SortFieldType::U64, SortValue::I64(val)) => { + projected_number_internal_repr(i64_to_u64(*val), sort_order, idx as u8) + } + // project to i64 column + (SortFieldType::I64, SortValue::I64(val)) => { + InternalValueRepr::new(val.to_u64(), idx as u8, sort_order) + } + (SortFieldType::I64, SortValue::F64(val)) => { + projected_number_internal_repr(f64_to_i64(*val), sort_order, idx as u8) + } + (SortFieldType::I64, SortValue::U64(val)) => { + projected_number_internal_repr(u64_to_i64(*val), sort_order, idx as u8) + } + // project to f64 column + (SortFieldType::F64, SortValue::F64(val)) => { + InternalValueRepr::new(val.to_u64(), idx as u8, sort_order) + } + (SortFieldType::F64, SortValue::I64(val)) => { + projected_number_internal_repr(i64_to_f64(*val), sort_order, idx as u8) + } + (SortFieldType::F64, SortValue::U64(val)) => { + projected_number_internal_repr(u64_to_f64(*val), sort_order, idx as u8) + } + // other types + (SortFieldType::DateTime, SortValue::Datetime(val)) => { + InternalValueRepr::new(val.to_u64(), idx as u8, sort_order) + } + (SortFieldType::Bool, SortValue::Boolean(val)) => { + InternalValueRepr::new(val.to_u64(), idx as 
u8, sort_order) + } + (SortFieldType::String(str_column), SortValue::Str(val)) => { + let term_dict = str_column.dictionary(); + let hit = term_dict.term_ord_or_next(val.as_str().as_bytes())?; + match (hit, sort_order) { + (TermOrdHit::Exact(ord), _) => { + InternalValueRepr::new(ord, idx as u8, sort_order) } - (SortValue::U64(mut val), SortFieldType::Bool) => { - let all_values_ahead1 = val > 1 && sort_order == SortOrder::Desc; - if all_values_ahead1 { - return None; - } - // clamp value for comparison - val = val.clamp(0, 1); - (val == 1).to_u64() + (TermOrdHit::Next(ord), SortOrder::Desc) => { + InternalValueRepr::new(ord, idx as u8, sort_order) } - (SortValue::I64(mut val), SortFieldType::Bool) => { - let all_values_ahead1 = val > 1 && sort_order == SortOrder::Desc; - let all_values_ahead2 = val < 0 && sort_order == SortOrder::Asc; - if all_values_ahead1 || all_values_ahead2 { - return None; - } - // clamp value for comparison - val = val.clamp(0, 1); - (val == 1).to_u64() + (TermOrdHit::Next(0), SortOrder::Asc) => { + InternalValueRepr::new_keep_column(idx as u8, sort_order) } - (SortValue::F64(mut val), SortFieldType::Bool) => { - let all_values_ahead1 = val > 1.0 && sort_order == SortOrder::Desc; - let all_values_ahead2 = val < 0.0 && sort_order == SortOrder::Asc; - if all_values_ahead1 || all_values_ahead2 { - return None; - } - val = val.clamp(0.0, 1.0); - (val >= 0.5).to_u64() // Is this correct? 
+ (TermOrdHit::Next(ord), SortOrder::Asc) => { + InternalValueRepr::new(ord - 1, idx as u8, sort_order) } + } + } + // unsupported mixed types + // + // TODO: we need a strongly typed pagination API to support JSON + // fields with datetime and schema evolutions + ( + SortFieldType::I64 | SortFieldType::U64 | SortFieldType::F64, + SortValue::Datetime(_), + ) => { + return Err(TantivyError::SchemaError( + "search after not supported for schema updates to datetime".to_string(), + )); + } + ( + SortFieldType::DateTime, + SortValue::I64(_) | SortValue::U64(_) | SortValue::F64(_), + ) => { + return Err(TantivyError::SchemaError( + "search after not supported on multi-typed fields with datetime".to_string(), + )); + } + // supported mixed types + (sort_field_type, sort_value) => { + let column_key = sort_field_type.type_sort_key(); + let value_key = sort_value.type_sort_key(); + debug_assert_ne!(column_key, value_key); + let column_comes_after = match sort_order { + SortOrder::Desc => column_key < value_key, + SortOrder::Asc => column_key > value_key, }; - Some(val) + if column_comes_after { + InternalValueRepr::new_keep_column(idx as u8, sort_order) + } else { + continue; + } } - SortingFieldExtractorComponent::Score => match sort_value { - SortValue::F64(val) => Some(val.to_u64()), - _ => panic!("Internal error: Got non-F64 sort value for Score."), - }, - } + }; + return Ok(internal_repr); } + Ok(InternalValueRepr::new_skip_all_but_missing()) } -impl From for SortingFieldExtractorPair { - fn from(value: SortingFieldExtractorComponent) -> Self { - Self { - first: value, - second: None, - } - } +pub(crate) struct SortingFieldExtractorPair { + first: SortingFieldExtractorComponent, + second: Option, + first_order: SortOrder, + second_order: SortOrder, + sort1_scratch: Box<[InternalValueRepr; COLLECT_BLOCK_BUFFER_LEN]>, + sort2_scratch: Box<[InternalValueRepr; COLLECT_BLOCK_BUFFER_LEN]>, } -pub(crate) struct SortingFieldExtractorPair { - pub first: 
SortingFieldExtractorComponent, - pub second: Option, -} +impl SortingFieldExtractorPair { + fn doc_id_sort_order(&self) -> SortOrder { + if self.first.is_doc_id() { + self.first_order + } else if let Some(second) = &self.second + && second.is_doc_id() + { + self.second_order + } else { + // TODO this is the current behavior which is weird. QW docs for the + // native search API advertise that the sort order by default is + // reverse(doc_id). In ES _shard_doc is supposed to be always ascending. + self.first_order + } + } -impl SortingFieldExtractorPair { - pub fn is_score(&self) -> bool { - self.first.is_score() - || self - .second + pub(crate) fn search_after_from_partial_hit( + &self, + split_id: &SplitId, + segment_ord: SegmentOrdinal, + partial_hit: &PartialHit, + ) -> tantivy::Result> { + let sort_1 = if let Some(sort_by_value) = &partial_hit.sort_value { + self.first + .project_to_internal_search_after(sort_by_value, self.first_order)? + } else { + InternalValueRepr::new_missing() + }; + let sort_2 = if let Some(sort_by_value) = &partial_hit.sort_value2 { + self.second .as_ref() - .map(|second| second.is_score()) - .unwrap_or(false) + .ok_or_else(|| { + TantivyError::InvalidArgument( + "search after has 2 values but there is only 1 sort dimension".to_string(), + ) + })? + .project_to_internal_search_after(sort_by_value, self.second_order)? + } else { + InternalValueRepr::new_missing() + }; + + let internal_repr = if partial_hit.split_id.is_empty() { + // When split_id is empty, the search_after is a pure sort-value + // boundary (no doc position), any doc with the same sort value must be + // excluded otherwise we risk iterating over an over through the same + // documents. 
+ InternalSortValueRepr::new_skip_doc_ids(sort_1, sort_2) + } else { + let split_cmp = split_id + .as_str() + .cmp(partial_hit.split_id.as_str()) + .then(segment_ord.cmp(&partial_hit.segment_ord)); + match (split_cmp, self.doc_id_sort_order()) { + (Ordering::Less, SortOrder::Asc) | (Ordering::Greater, SortOrder::Desc) => { + InternalSortValueRepr::new_skip_doc_ids(sort_1, sort_2) + } + (Ordering::Less, SortOrder::Desc) | (Ordering::Greater, SortOrder::Asc) => { + InternalSortValueRepr::new_keep_doc_ids(sort_1, sort_2) + } + (Ordering::Equal, doc_id_order) => { + InternalSortValueRepr::new(sort_1, sort_2, partial_hit.doc_id, doc_id_order) + } + } + }; + Ok(internal_repr) } - /// Returns the list of sort values for the given element - /// - /// See also [`SortingFieldExtractorComponent::extract_typed_sort_values_block`] for more - /// information. - #[inline] - pub(crate) fn extract_typed_sort_values( + + pub(crate) fn internal_to_partial_hit( &self, - doc_ids: &[DocId], - values1: &mut [Option], - values2: &mut [Option], - ) { - self.first - .extract_typed_sort_values_block(doc_ids, &mut values1[..doc_ids.len()]); - if let Some(second) = self.second.as_ref() { - second.extract_typed_sort_values_block(doc_ids, &mut values2[..doc_ids.len()]); - } + split_id: &SplitId, + segment_ord: SegmentOrdinal, + internal_repr: InternalSortValueRepr, + ) -> tantivy::Result { + let sort_1 = self + .first + .project_from_internal_sort_value(internal_repr.sort_1(), self.first_order)?; + let sort_2 = self + .second + .as_ref() + .map(|second| { + second.project_from_internal_sort_value(internal_repr.sort_2(), self.second_order) + }) + .transpose()? 
+ .unwrap_or_default(); + Ok(PartialHit { + sort_value: sort_1, + sort_value2: sort_2, + doc_id: internal_repr.doc_id(self.doc_id_sort_order()), + split_id: split_id.clone(), + segment_ord, + }) } + /// Returns the list of sort values for the given element /// /// See also [`SortingFieldExtractorComponent::extract_typed_sort_value_opt`] for more /// information. #[inline] - pub(crate) fn extract_typed_sort_value( + pub(crate) fn project_to_internal_sort_value( &self, doc_id: DocId, score: Score, - ) -> (Option, Option) { - let first = self.first.extract_typed_sort_value_opt(doc_id, score); + ) -> InternalSortValueRepr { + let first = self + .first + .project_to_internal_sort_value(doc_id, score, self.first_order); let second = self .second .as_ref() - .and_then(|second| second.extract_typed_sort_value_opt(doc_id, score)); - (first, second) + .map(|second| second.project_to_internal_sort_value(doc_id, score, self.second_order)) + .unwrap_or_else(InternalValueRepr::new_missing); + InternalSortValueRepr::new(first, second, doc_id, self.doc_id_sort_order()) } -} -impl TryFrom for SortFieldType { - type Error = tantivy::TantivyError; - - fn try_from(column_type: ColumnType) -> tantivy::Result { - match column_type { - ColumnType::U64 => Ok(SortFieldType::U64), - ColumnType::I64 => Ok(SortFieldType::I64), - ColumnType::F64 => Ok(SortFieldType::F64), - ColumnType::DateTime => Ok(SortFieldType::DateTime), - ColumnType::Bool => Ok(SortFieldType::Bool), - _ => Err(TantivyError::InvalidArgument(format!( - "Unsupported sort field type `{column_type:?}`." - ))), + pub(crate) fn project_to_internal_sort_value_block( + &mut self, + docs: &[DocId], + mut f: impl FnMut(InternalSortValueRepr), + ) { + let doc_id_order = self.doc_id_sort_order(); + let first_order = self.first_order; + let second_order = self.second_order; + + let n = docs.len(); + + let SortingFieldExtractorPair { + first, + second, + sort1_scratch, + sort2_scratch, + .. 
+ } = self; + + let first_extractor_opt = first.extractor_for_batch_if_worthwhile(); + let second_extractor_opt = second + .as_mut() + .and_then(|s| s.extractor_for_batch_if_worthwhile()); + match (first_extractor_opt, second_extractor_opt) { + (Some(fst_batch_extr), Some(sec_batch_extr)) => { + fst_batch_extr.fill_batch(docs, first_order, &mut sort1_scratch[..n]); + sec_batch_extr.fill_batch(docs, second_order, &mut sort2_scratch[..n]); + for i in 0..n { + f(InternalSortValueRepr::new( + sort1_scratch[i], + sort2_scratch[i], + docs[i], + doc_id_order, + )); + } + } + (Some(fst_batch_extr), None) => { + fst_batch_extr.fill_batch(docs, first_order, &mut sort1_scratch[..n]); + for i in 0..n { + let sort2 = second + .as_ref() + .map(|s| s.project_to_internal_sort_value(docs[i], 0.0, second_order)) + .unwrap_or_else(InternalValueRepr::new_missing); + f(InternalSortValueRepr::new( + sort1_scratch[i], + sort2, + docs[i], + doc_id_order, + )); + } + } + (None, Some(sec_batch_extr)) => { + sec_batch_extr.fill_batch(docs, second_order, &mut sort2_scratch[..n]); + for i in 0..n { + let sort1 = first.project_to_internal_sort_value(docs[i], 0.0, first_order); + f(InternalSortValueRepr::new( + sort1, + sort2_scratch[i], + docs[i], + doc_id_order, + )); + } + } + (None, None) => { + for &doc_id in docs { + let first = self + .first + .project_to_internal_sort_value(doc_id, 0.0, first_order); + let second = self + .second + .as_ref() + .map(|s| s.project_to_internal_sort_value(doc_id, 0.0, second_order)) + .unwrap_or_else(InternalValueRepr::new_missing); + f(InternalSortValueRepr::new( + first, + second, + doc_id, + doc_id_order, + )); + } + } } } } -/// Takes a user-defined sorting criteria and resolves it to a -/// segment specific `SortingFieldExtractorPair`. 
-fn get_score_extractor( - sort_by: &SortByPair, - segment_reader: &SegmentReader, -) -> tantivy::Result { - Ok(SortingFieldExtractorPair { - first: sort_by - .first - .to_sorting_field_extractor_component(segment_reader)?, - second: sort_by - .second - .as_ref() - .map(|first| first.to_sorting_field_extractor_component(segment_reader)) - .transpose()?, - }) -} - #[allow(clippy::large_enum_variant)] enum AggregationSegmentCollectors { FindTraceIdsSegmentCollector(Box), @@ -474,51 +709,50 @@ enum AggregationSegmentCollectors { /// Quickwit collector working at the scale of the segment. pub struct QuickwitSegmentCollector { - segment_top_k_collector: Option>, + segment_top_k_collector: Option, aggregation: Option, num_hits: u64, } -#[derive(Copy, Clone, Debug)] -pub(crate) struct SegmentPartialHit { - /// Normalized to u64, the typed value can be reconstructed with - /// SortingFieldExtractorComponent. - pub sort_value: Option, - pub sort_value2: Option, - pub doc_id: DocId, -} - -impl SegmentPartialHit { - pub fn into_partial_hit( - self, - split_id: SplitId, - segment_ord: SegmentOrdinal, - first: &SortingFieldExtractorComponent, - second: &Option, - ) -> PartialHit { - PartialHit { - sort_value: self - .sort_value - .map(|sort_value| first.convert_u64_ff_val_to_sort_value(sort_value)) - .map(|sort_value| SortByValue { - sort_value: Some(sort_value), - }), - sort_value2: self - .sort_value2 - .map(|sort_value| { - second - .as_ref() - .expect("Internal error: Got sort_value2, but no sort extractor") - .convert_u64_ff_val_to_sort_value(sort_value) - }) - .map(|sort_value| SortByValue { - sort_value: Some(sort_value), - }), - doc_id: self.doc_id, - split_id, - segment_ord, - } - } +/// Takes a user-defined sorting criteria and resolves it to a +/// segment specific `SortingFieldExtractorPair`. 
+#[allow(clippy::type_complexity)] +fn get_sorting_field_extractors( + sort_by: &SortByPair, + segment_reader: &SegmentReader, + split_id: &SplitId, + segment_ord: SegmentOrdinal, + search_after: &Option, +) -> tantivy::Result<( + SortingFieldExtractorPair, + Option>, +)> { + let extractor = SortingFieldExtractorPair { + first: sort_by + .first + .to_sorting_field_extractor_component(segment_reader)?, + second: sort_by + .second + .as_ref() + .map(|first| first.to_sorting_field_extractor_component(segment_reader)) + .transpose()?, + first_order: sort_by.first.sort_order(), + second_order: sort_by + .second + .as_ref() + .map(|second| second.sort_order()) + // value irrelevant? + .unwrap_or(SortOrder::Desc), + sort1_scratch: Box::new([InternalValueRepr::new_missing(); COLLECT_BLOCK_BUFFER_LEN]), + sort2_scratch: Box::new([InternalValueRepr::new_missing(); COLLECT_BLOCK_BUFFER_LEN]), + }; + let search_after_opt = search_after + .as_ref() + .map(|search_after| { + extractor.search_after_from_partial_hit(split_id, segment_ord, search_after) + }) + .transpose()?; + Ok((extractor, search_after_opt)) } impl SegmentCollector for QuickwitSegmentCollector { @@ -526,7 +760,6 @@ impl SegmentCollector for QuickwitSegmentCollector { #[inline] fn collect_block(&mut self, filtered_docs: &[DocId]) { - // Update results self.num_hits += filtered_docs.len() as u64; if let Some(segment_top_k_collector) = self.segment_top_k_collector.as_mut() { @@ -565,7 +798,7 @@ impl SegmentCollector for QuickwitSegmentCollector { fn harvest(self) -> Self::Fruit { let mut partial_hits: Vec = Vec::new(); if let Some(segment_top_k_collector) = self.segment_top_k_collector { - partial_hits = segment_top_k_collector.get_top_k(); + partial_hits = segment_top_k_collector.get_top_k()?; } let intermediate_aggregation_result = match self.aggregation { @@ -668,7 +901,7 @@ impl QuickwitIncrementalAggregations { let timestamp = last_elem.span_timestamp.into_timestamp_nanos(); return Some(PartialHit { sort_value: 
Some(SortByValue { - sort_value: Some(SortValue::I64(timestamp)), + sort_value: Some(SortValue::Datetime(timestamp)), }), sort_value2: None, split_id: SplitId::new(), @@ -792,22 +1025,70 @@ impl Collector for QuickwitCollector { ), None => None, }; - let score_extractor = get_score_extractor(&self.sort_by, segment_reader)?; - let (order1, order2) = self.sort_by.sort_orders(); let segment_top_k_collector = if leaf_max_hits == 0 { None } else { - let coll: Box = specialized_top_k_segment_collector( - self.split_id.clone(), - score_extractor, - leaf_max_hits, - segment_ord, - self.search_after.clone(), - order1, - order2, - ); - Some(coll) + let segment_top_k_collector = match self.sort_by { + SortByPair { + first: SortByComponent::DocId { .. }, + second: None, + } => { + let (extractor, search_after_opt) = get_sorting_field_extractors( + &self.sort_by, + segment_reader, + &self.split_id, + segment_ord, + &self.search_after, + )?; + QuickwitSegmentTopKCollector::new_with_doc_id_sort( + self.split_id.clone(), + segment_ord, + extractor, + leaf_max_hits, + search_after_opt, + ) + } + SortByPair { + first: _, + second: None | Some(SortByComponent::DocId { .. 
}), + } => { + let (extractor, search_after_opt) = get_sorting_field_extractors( + &self.sort_by, + segment_reader, + &self.split_id, + segment_ord, + &self.search_after, + )?; + QuickwitSegmentTopKCollector::new_with_one_dim_sort( + self.split_id.clone(), + segment_ord, + extractor, + leaf_max_hits, + search_after_opt, + ) + } + SortByPair { + first: _, + second: Some(_), + } => { + let (extractor, search_after_opt) = get_sorting_field_extractors( + &self.sort_by, + segment_reader, + &self.split_id, + segment_ord, + &self.search_after, + )?; + QuickwitSegmentTopKCollector::new_with_two_dim_sort( + self.split_id.clone(), + segment_ord, + extractor, + leaf_max_hits, + search_after_opt, + ) + } + }; + Some(segment_top_k_collector) }; Ok(QuickwitSegmentCollector { @@ -1008,14 +1289,20 @@ pub(crate) fn sort_by_from_request(search_request: &SearchRequest) -> SortByPair let num_sort_fields = search_request.sort_fields.len(); if num_sort_fields == 0 { - SortByComponent::DocId { - order: SortOrder::Desc, + SortByPair { + first: SortByComponent::DocId { + order: SortOrder::Desc, + }, + second: None, } - .into() } else if num_sort_fields == 1 { let sort_field = &search_request.sort_fields[0]; let order = SortOrder::try_from(sort_field.sort_order).unwrap_or(SortOrder::Desc); - to_sort_by_component(&sort_field.field_name, order).into() + let first = to_sort_by_component(&sort_field.field_name, order); + SortByPair { + first, + second: None, + } } else if num_sort_fields == 2 { let sort_field1 = &search_request.sort_fields[0]; let order1 = SortOrder::try_from(sort_field1.sort_order).unwrap_or(SortOrder::Desc); @@ -1080,44 +1367,6 @@ pub(crate) fn make_merge_collector( }) } -#[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub struct SegmentPartialHitSortingKey { - sort_value: Option, - sort_value2: Option, - doc_id: DocId, - // TODO This should not be there. - sort_order: SortOrder, - // TODO This should not be there. 
- sort_order2: SortOrder, -} - -impl Ord for SegmentPartialHitSortingKey { - fn cmp(&self, other: &SegmentPartialHitSortingKey) -> Ordering { - debug_assert_eq!( - self.sort_order, other.sort_order, - "comparing two PartialHitSortingKey of different ordering" - ); - debug_assert_eq!( - self.sort_order2, other.sort_order2, - "comparing two PartialHitSortingKey of different ordering" - ); - let order = self - .sort_order - .compare_opt(&self.sort_value, &other.sort_value); - let order2 = self - .sort_order2 - .compare_opt(&self.sort_value2, &other.sort_value2); - let order_addr = self.sort_order.compare(&self.doc_id, &other.doc_id); - order.then(order2).then(order_addr) - } -} - -impl PartialOrd for SegmentPartialHitSortingKey { - fn partial_cmp(&self, other: &SegmentPartialHitSortingKey) -> Option { - Some(self.cmp(other)) - } -} - #[derive(Clone, Debug, PartialEq, Eq)] pub(crate) struct PartialHitSortingKey { sort_value: Option, @@ -1169,8 +1418,8 @@ impl SortKeyMapper for HitSortingMapper { type Key = PartialHitSortingKey; fn get_sort_key(&self, partial_hit: &PartialHit) -> PartialHitSortingKey { PartialHitSortingKey { - sort_value: partial_hit.sort_value.and_then(|v| v.sort_value), - sort_value2: partial_hit.sort_value2.and_then(|v| v.sort_value), + sort_value: partial_hit.sort_value.clone().and_then(|v| v.sort_value), + sort_value2: partial_hit.sort_value2.clone().and_then(|v| v.sort_value), address: GlobalDocAddress::from_partial_hit(partial_hit), sort_order: self.order1, sort_order2: self.order2, @@ -1178,19 +1427,6 @@ impl SortKeyMapper for HitSortingMapper { } } -impl SortKeyMapper for HitSortingMapper { - type Key = SegmentPartialHitSortingKey; - fn get_sort_key(&self, partial_hit: &SegmentPartialHit) -> SegmentPartialHitSortingKey { - SegmentPartialHitSortingKey { - sort_value: partial_hit.sort_value, - sort_value2: partial_hit.sort_value2, - doc_id: partial_hit.doc_id, - sort_order: self.order1, - sort_order2: self.order2, - } - } -} - /// Incrementally 
merge segment results. #[derive(Clone)] pub(crate) struct IncrementalCollector { @@ -1307,9 +1543,10 @@ mod tests { use tantivy::aggregation::intermediate_agg_result::IntermediateAggregationResults; use tantivy::collector::Collector; - use super::{IncrementalCollector, make_merge_collector}; - use crate::QuickwitAggregations; - use crate::collector::{merge_intermediate_aggregation_result, top_k_partial_hits}; + use super::{ + IncrementalCollector, QuickwitAggregations, make_merge_collector, + merge_intermediate_aggregation_result, top_k_partial_hits, + }; #[test] fn test_merge_partial_hits_no_tie() { @@ -1394,66 +1631,52 @@ mod tests { ] } - fn make_request(max_hits: u64, sort_fields: &str) -> SearchRequest { - SearchRequest { - max_hits, - sort_fields: sort_fields - .split(',') - .filter(|field| !field.is_empty()) - .map(|field| { - if let Some(field) = field.strip_prefix('-') { - SortField { - field_name: field.to_string(), - sort_order: SortOrder::Asc.into(), - sort_datetime_format: None, - } - } else { - SortField { - field_name: field.to_string(), - sort_order: SortOrder::Desc.into(), - sort_datetime_format: None, - } + /// Create a list of SortField from a comma-separated list of field names. + /// Field names can be prefixed with - to indicate ascending order. + fn make_sort_fields(sort_fields: &str) -> Vec { + sort_fields + .split(',') + .filter(|field| !field.is_empty()) + .map(|field| { + if let Some(field) = field.strip_prefix('-') { + SortField { + field_name: field.to_string(), + sort_order: SortOrder::Asc.into(), + sort_datetime_format: None, } - }) - .collect(), - ..SearchRequest::default() - } + } else { + SortField { + field_name: field.to_string(), + sort_order: SortOrder::Desc.into(), + sort_datetime_format: None, + } + } + }) + .collect() } - fn make_index() -> tantivy::Index { + /// Build a tantivy index from a JSON dataset. Each element must be a JSON + /// object whose keys match field names in the pre-determined schema. 
+ fn make_index(dataset: &[serde_json::Value]) -> tantivy::Index { use tantivy::Index; use tantivy::indexer::UserOperation; - use tantivy::schema::{NumericOptions, Schema}; - - let dataset = sort_dataset(); + use tantivy::schema::{FAST, NumericOptions, Schema}; let mut schema_builder = Schema::builder(); let opts = NumericOptions::default().set_fast(); - - schema_builder.add_u64_field("sort1", opts.clone()); - schema_builder.add_u64_field("sort2", opts); + schema_builder.add_u64_field("sort_u64_1", opts.clone()); + schema_builder.add_u64_field("sort_u64_2", opts); + schema_builder.add_json_field("kv", FAST); let schema = schema_builder.build(); - let field1 = schema.get_field("sort1").unwrap(); - let field2 = schema.get_field("sort2").unwrap(); - - let index = Index::create_in_ram(schema); + let index = Index::create_in_ram(schema.clone()); let mut index_writer = index.writer(50_000_000).unwrap(); index_writer .run( dataset - .into_iter() - .map(|(val1, val2)| { - let mut doc = TantivyDocument::new(); - if let Some(val1) = val1 { - doc.add_u64(field1, val1); - } - if let Some(val2) = val2 { - doc.add_u64(field2, val2); - } - doc - }) + .iter() + .map(|obj| TantivyDocument::parse_json(&schema, &obj.to_string()).unwrap()) .map(UserOperation::Add), ) .unwrap(); @@ -1463,8 +1686,22 @@ mod tests { } #[test] - fn test_single_split_sorting() { - let index = make_index(); + fn test_single_split_sorting_single_type() { + let raw_dataset = sort_dataset(); + let json_dataset: Vec = raw_dataset + .iter() + .map(|(v1, v2)| { + let mut obj = serde_json::Map::new(); + if let Some(v) = v1 { + obj.insert("sort_u64_1".to_string(), (*v).into()); + } + if let Some(v) = v2 { + obj.insert("sort_u64_2".to_string(), (*v).into()); + } + serde_json::Value::Object(obj) + }) + .collect(); + let index = make_index(&json_dataset); let reader = index.reader().unwrap(); let searcher = reader.searcher(); @@ -1472,7 +1709,7 @@ mod tests { // tuple of DocId and sort value type Doc = (usize, (Option, 
Option)); - let mut dataset: Vec = sort_dataset().into_iter().enumerate().collect(); + let mut dataset: Vec = raw_dataset.into_iter().enumerate().collect(); let reverse_int = |val: &Option| val.as_ref().map(|val| u64::MAX - val); let cmp_doc_id_desc = |a: &Doc, b: &Doc| b.0.cmp(&a.0); @@ -1532,25 +1769,27 @@ mod tests { assert_eq!(data, data_copy); } + // The implicit doc_id tiebreaker is always ascending, matching Elasticsearch's + // behavior where _shard_doc is always ascending regardless of primary sort direction. #[allow(clippy::type_complexity)] let sort_orders: Vec<(_, Box Ordering>)> = vec![ ("", Box::new(cmp_doc_id_desc)), ( - "sort1", + "sort_u64_1", Box::new(|a, b| cmp_1_desc(a, b).then(cmp_doc_id_desc(a, b))), ), ( - "-sort1", + "-sort_u64_1", Box::new(|a, b| cmp_1_asc(a, b).then(cmp_doc_id_asc(a, b))), ), ( - "sort1,sort2", + "sort_u64_1,sort_u64_2", Box::new(|a, b| { cmp_1_desc(a, b).then(cmp_2_desc(a, b).then(cmp_doc_id_desc(a, b))) }), ), ( - "-sort1,sort2", + "-sort_u64_1,sort_u64_2", Box::new(|a, b| { cmp_1_asc(a, b) .then(cmp_2_desc(a, b)) @@ -1558,11 +1797,11 @@ mod tests { }), ), ( - "sort1,-sort2", + "sort_u64_1,-sort_u64_2", Box::new(|a, b| cmp_1_desc(a, b).then(cmp_2_asc(a, b).then(cmp_doc_id_desc(a, b)))), ), ( - "-sort1,-sort2", + "-sort_u64_1,-sort_u64_2", Box::new(|a, b| { cmp_1_asc(a, b) .then(cmp_2_asc(a, b)) @@ -1577,7 +1816,11 @@ mod tests { for slice_len in 0..dataset.len() { let collector = super::make_collector_for_split( "fake_split_id".to_string(), - &make_request(slice_len as u64, sort_str), + &SearchRequest { + max_hits: slice_len as u64, + sort_fields: make_sort_fields(sort_str), + ..SearchRequest::default() + }, Default::default(), ) .unwrap(); @@ -1604,8 +1847,8 @@ mod tests { format!( "{} {:?} {:?}", hit.doc_id, - hit.sort_value.and_then(|el| el.sort_value).clone(), - hit.sort_value2.and_then(|el| el.sort_value).clone() + hit.sort_value.clone().and_then(|el| el.sort_value), + hit.sort_value2.clone().and_then(|el| 
el.sort_value) ) }) .collect::>(); @@ -1619,8 +1862,22 @@ mod tests { } #[test] - fn test_search_after() { - let index = make_index(); + fn test_search_after_single_type() { + let raw_dataset = sort_dataset(); + let json_dataset: Vec = raw_dataset + .iter() + .map(|(v1, v2)| { + let mut obj = serde_json::Map::new(); + if let Some(v) = v1 { + obj.insert("sort_u64_1".to_string(), (*v).into()); + } + if let Some(v) = v2 { + obj.insert("sort_u64_2".to_string(), (*v).into()); + } + serde_json::Value::Object(obj) + }) + .collect(); + let index = make_index(&json_dataset); let reader = index.reader().unwrap(); let searcher = reader.searcher(); @@ -1628,7 +1885,7 @@ mod tests { // tuple of DocId and sort value type Doc = (usize, (Option, Option)); - let mut dataset: Vec = sort_dataset().into_iter().enumerate().collect(); + let mut dataset: Vec = raw_dataset.into_iter().enumerate().collect(); let reverse_int = |val: &Option| val.as_ref().map(|val| u64::MAX - val); let cmp_doc_id_desc = |a: &Doc, b: &Doc| b.0.cmp(&a.0); @@ -1658,12 +1915,12 @@ mod tests { max_hits: 1000, sort_fields: vec![ SortField { - field_name: "sort1".to_string(), + field_name: "sort_u64_1".to_string(), sort_order: SortOrder::Desc.into(), sort_datetime_format: None, }, SortField { - field_name: "sort2".to_string(), + field_name: "sort_u64_2".to_string(), sort_order: SortOrder::Asc.into(), sort_datetime_format: None, }, @@ -1752,7 +2009,256 @@ mod tests { } } - fn merge_collector_equal_results( + fn assert_search_after_results( + searcher: &tantivy::Searcher, + index_len: usize, + sort_str: &str, + search_after: PartialHit, + expected_doc_ids: impl AsRef<[u32]>, + label: &str, + ) { + let expected_doc_ids = expected_doc_ids.as_ref(); + let request = SearchRequest { + max_hits: 1000, + sort_fields: make_sort_fields(sort_str), + search_after: Some(search_after.clone()), + ..SearchRequest::default() + }; + let collector = super::make_collector_for_split( + "fake_split_id".to_string(), + &request, + 
Default::default(), + ) + .unwrap(); + let Ok(res) = searcher.search(&tantivy::query::AllQuery, &collector) else { + panic!("search failed for {label} with search_after {search_after:?}"); + }; + // num_hits counts every doc regardless of search_after. + assert_eq!( + res.num_hits, index_len as u64, + "num_hits mismatch for {label}" + ); + assert_eq!( + res.partial_hits.len(), + expected_doc_ids.len(), + "result count mismatch for {label}" + ); + for (expected_doc_id, got) in expected_doc_ids.iter().zip(res.partial_hits.iter()) { + assert_eq!( + *expected_doc_id, got.doc_id, + "doc order mismatch for {label} after {search_after:?}" + ); + } + } + + #[test] + fn test_single_split_search_after_multitype() { + let dataset: Vec = vec![ + serde_json::json!({"kv": {"sort1": false, "sort2": "b"}}), // doc 0 + serde_json::json!({"kv": {"sort1": true, "sort2": "a"}}), // doc 1 + serde_json::json!({"kv": {"sort1": "apple", "sort2": "a"}}), // doc 2 + serde_json::json!({"kv": {"sort1": "banana", "sort2": "b"}}), // doc 3 + serde_json::json!({"kv": {"sort1": 1, "sort2": "b"}}), // doc 4 + serde_json::json!({"kv": {"sort1": 5, "sort2": "a"}}), // doc 5 + serde_json::json!({}), // doc 6: missing + ]; + + let index = make_index(&dataset); + let reader = index.reader().unwrap(); + let searcher = reader.searcher(); + + for (sort_str, expected_order) in [ + // Desc: booleans (true first) > strings (lex desc) > numbers (largest first) > missing + ("kv.sort1", &[1, 0, 3, 2, 5, 4, 6]), + // Asc: numbers (smallest first) > strings (lex asc) > booleans (false first) > + // missing + ("-kv.sort1", &[4, 5, 2, 3, 0, 1, 6]), + ("", &[6, 5, 4, 3, 2, 1, 0]), + ("_doc", &[6, 5, 4, 3, 2, 1, 0]), + ("-_doc", &[0, 1, 2, 3, 4, 5, 6]), + // sort2 with "b" first then "a" + ("kv.sort2,kv.sort1", &[0, 3, 4, 1, 2, 5, 6]), + // sort2 with "a" first then "b" + ("-kv.sort2,kv.sort1", &[1, 2, 5, 0, 3, 4, 6]), + ] { + // Step 1: full search to collect PartialHits carrying the correct typed SortValues. 
+ let collector = super::make_collector_for_split( + "fake_split_id".to_string(), + &SearchRequest { + max_hits: 1000, + sort_fields: make_sort_fields(sort_str), + ..Default::default() + }, + Default::default(), + ) + .unwrap(); + let full_res = searcher + .search(&tantivy::query::AllQuery, &collector) + .unwrap(); + assert_eq!(full_res.partial_hits.len(), dataset.len()); + for (expected_doc_id, got) in expected_order.iter().zip(full_res.partial_hits.iter()) { + assert_eq!( + *expected_doc_id, got.doc_id, + "sort order mismatch for \"{sort_str}\"" + ); + } + + // Step 2: use each PartialHit as a search_after fence and verify the returned tail. + for (i, search_after) in full_res.partial_hits.iter().enumerate() { + assert_search_after_results( + &searcher, + dataset.len(), + sort_str, + search_after.clone(), + &expected_order[i + 1..], + &format!("\"{sort_str}\" search_after position {i}"), + ); + } + } + } + + #[test] + fn test_single_split_search_after_exogeneous_type() { + let dataset: Vec = vec![ + serde_json::json!({"kv": {"mixed": false, "integer": 1}}), // doc 0 + serde_json::json!({"kv": {"mixed": true, "integer": 4}}), // doc 1 + serde_json::json!({"kv": {"mixed": "banana", "integer": 3}}), // doc 2 + serde_json::json!({"kv": {"mixed": "plum", "integer": 4}}), // doc 3 + ]; + + let index = make_index(&dataset); + let reader = index.reader().unwrap(); + let searcher = reader.searcher(); + let str_sort_val = |s: &str| SortValue::Str(s.to_string()); + for (sort_str, search_after_value, expected_order) in [ + // Desc: booleans (true first) > strings (lex desc) > numbers (search after) > missing + ("kv.mixed", SortValue::I64(-10), vec![]), + // Asc: numbers (search after) > strings (lex asc) > booleans (false first) > missing + ("-kv.mixed", SortValue::I64(-10), vec![2, 3, 0, 1]), + // project f64 to i64 + ("kv.integer", SortValue::F64(3.5), vec![2, 0]), + ("-kv.integer", SortValue::F64(3.5), vec![1, 3]), + // str not in columns dict, check all possible relative 
position + ("kv.mixed", str_sort_val("c"), vec![2]), + ("-kv.mixed", str_sort_val("c"), vec![3, 0, 1]), + ("kv.mixed", str_sort_val("a"), vec![]), + ("-kv.mixed", str_sort_val("a"), vec![2, 3, 0, 1]), + ("kv.mixed", str_sort_val("z"), vec![3, 2]), + ("-kv.mixed", str_sort_val("z"), vec![0, 1]), + ] { + assert_search_after_results( + &searcher, + dataset.len(), + sort_str, + PartialHit { + sort_value: Some(search_after_value.clone().into()), + sort_value2: None, + ..Default::default() + }, + expected_order, + &format!("\"{sort_str}\""), + ); + } + } + + #[test] + fn test_single_split_search_after_exogeneous_type_with_null() { + let dataset: Vec = vec![ + serde_json::json!({"kv": {"sort": false}}), // doc 0 + serde_json::json!({"kv": {"sort": true}}), // doc 1 + serde_json::json!({"kv": {"sort": "apple"}}), // doc 2 + serde_json::json!({"kv": {"sort": "banana"}}), // doc 3 + serde_json::json!({}), // doc 4: missing + ]; + + let index = make_index(&dataset); + let reader = index.reader().unwrap(); + let searcher = reader.searcher(); + + let search_after_value = SortValue::I64(-10); + + // Desc: booleans (true first) > strings (lex desc) > numbers (search after) > missing + let desc_order: &[u32] = &[4]; + // Asc: numbers (search after) > strings (lex asc) > booleans (false first) > missing + let asc_order: &[u32] = &[2, 3, 0, 1, 4]; + + for (sort_str, expected_order) in [("kv.sort", desc_order), ("-kv.sort", asc_order)] { + assert_search_after_results( + &searcher, + dataset.len(), + sort_str, + PartialHit { + sort_value: Some(search_after_value.clone().into()), + sort_value2: None, + ..Default::default() + }, + expected_order, + &format!("\"{sort_str}\""), + ); + } + } + + #[test] + fn test_single_split_default_sort() { + let dataset: Vec = vec![ + serde_json::json!({"sort_u64_1": 15}), // doc 0 + serde_json::json!({"sort_u64_1": 13}), // doc 1 + serde_json::json!({"sort_u64_1": 10}), // doc 2 + serde_json::json!({"sort_u64_1": 12}), // doc 3 + 
serde_json::json!({"sort_u64_1": 9}), // doc 4 + ]; + + let index = make_index(&dataset); + let reader = index.reader().unwrap(); + let searcher = reader.searcher(); + + let request = SearchRequest { + max_hits: 3, + sort_fields: vec![], + search_after: None, + ..SearchRequest::default() + }; + let collector = super::make_collector_for_split( + "fake_split_id".to_string(), + &request, + Default::default(), + ) + .unwrap(); + let res = searcher + .search(&tantivy::query::AllQuery, &collector) + .unwrap(); + // assert the exact hits where in other tests we mostly focus on the order + assert_eq!( + res.partial_hits, + vec![ + PartialHit { + split_id: "fake_split_id".to_string(), + segment_ord: 0, + doc_id: 4, + sort_value: None, + sort_value2: None, + }, + PartialHit { + split_id: "fake_split_id".to_string(), + segment_ord: 0, + doc_id: 3, + sort_value: None, + sort_value2: None, + }, + PartialHit { + split_id: "fake_split_id".to_string(), + segment_ord: 0, + doc_id: 2, + sort_value: None, + sort_value2: None, + }, + ] + ); + } + + /// Merge intermediate results, asserting that both the regular and + /// incremental merge produce the same output. 
+ fn merge_on_both_collectors( request: &SearchRequest, results: Vec, ) -> LeafSearchResponse { @@ -1774,7 +2280,7 @@ mod tests { #[test] fn test_merge_collectors() { - let result = merge_collector_equal_results( + let result = merge_on_both_collectors( &SearchRequest { start_offset: 0, max_hits: 2, @@ -1822,7 +2328,7 @@ mod tests { } ); - let result = merge_collector_equal_results( + let result = merge_on_both_collectors( &SearchRequest { start_offset: 0, max_hits: 2, @@ -1914,7 +2420,7 @@ mod tests { ); // same request, but we reverse sort order - let result = merge_collector_equal_results( + let result = merge_on_both_collectors( &SearchRequest { start_offset: 0, max_hits: 2, diff --git a/quickwit/quickwit-search/src/error.rs b/quickwit/quickwit-search/src/error.rs index 21141a3035a..71d67ca4bc8 100644 --- a/quickwit/quickwit-search/src/error.rs +++ b/quickwit/quickwit-search/src/error.rs @@ -46,6 +46,8 @@ pub enum SearchError { Timeout(String), #[error("too many requests")] TooManyRequests, + #[error("too many splits: {0}")] + TooManySplits(String), #[error("service unavailable: {0}")] Unavailable(String), } @@ -87,6 +89,7 @@ impl ServiceError for SearchError { } Self::Timeout(_) => ServiceErrorCode::Timeout, Self::TooManyRequests => ServiceErrorCode::TooManyRequests, + Self::TooManySplits(_) => ServiceErrorCode::BadRequest, Self::Unavailable(_) => ServiceErrorCode::Unavailable, } } diff --git a/quickwit/quickwit-search/src/leaf.rs b/quickwit/quickwit-search/src/leaf.rs index 3d9e5d00cce..63504e474b3 100644 --- a/quickwit/quickwit-search/src/leaf.rs +++ b/quickwit/quickwit-search/src/leaf.rs @@ -52,6 +52,7 @@ use crate::metrics::{SplitSearchOutcomeCounters, queue_label}; use crate::root::is_metadata_count_request_with_ast; use crate::search_permit_provider::{SearchPermit, compute_initial_memory_allocation}; use crate::service::{SearcherContext, deserialize_doc_mapper}; +use crate::soft_delete_query::SoftDeleteQuery; use crate::{QuickwitAggregations, 
SearchError}; async fn get_split_footer_from_cache_or_fetch( @@ -218,16 +219,10 @@ pub(crate) async fn warmup(searcher: &Searcher, warmup_info: &WarmupInfo) -> any let warm_up_term_ranges_future = warm_up_term_ranges(searcher, &warmup_info.term_ranges_grouped_by_field) .instrument(debug_span!("warm_up_term_ranges")); - let warm_up_term_dict_future = - warm_up_term_dict_fields(searcher, &warmup_info.term_dict_fields) - .instrument(debug_span!("warm_up_term_dicts")); let warm_up_fastfields_future = warm_up_fastfields(searcher, &warmup_info.fast_fields) .instrument(debug_span!("warm_up_fastfields")); let warm_up_fieldnorms_future = warm_up_fieldnorms(searcher, warmup_info.field_norms) .instrument(debug_span!("warm_up_fieldnorms")); - // TODO merge warm_up_postings into warm_up_term_dict_fields - let warm_up_postings_future = warm_up_postings(searcher, &warmup_info.term_dict_fields) - .instrument(debug_span!("warm_up_postings")); let warm_up_automatons_future = warm_up_automatons(searcher, &warmup_info.automatons_grouped_by_field) .instrument(debug_span!("warm_up_automatons")); @@ -236,45 +231,13 @@ pub(crate) async fn warmup(searcher: &Searcher, warmup_info: &WarmupInfo) -> any warm_up_terms_future, warm_up_term_ranges_future, warm_up_fastfields_future, - warm_up_term_dict_future, warm_up_fieldnorms_future, - warm_up_postings_future, warm_up_automatons_future, )?; Ok(()) } -async fn warm_up_term_dict_fields( - searcher: &Searcher, - term_dict_fields: &HashSet, -) -> anyhow::Result<()> { - let mut warm_up_futures = Vec::new(); - for field in term_dict_fields { - for segment_reader in searcher.segment_readers() { - let inverted_index = segment_reader.inverted_index(*field)?.clone(); - warm_up_futures.push(async move { - let dict = inverted_index.terms(); - dict.warm_up_dictionary().await - }); - } - } - try_join_all(warm_up_futures).await?; - Ok(()) -} - -async fn warm_up_postings(searcher: &Searcher, fields: &HashSet) -> anyhow::Result<()> { - let mut warm_up_futures = 
Vec::new(); - for field in fields { - for segment_reader in searcher.segment_readers() { - let inverted_index = segment_reader.inverted_index(*field)?.clone(); - warm_up_futures.push(async move { inverted_index.warm_postings_full(false).await }); - } - } - try_join_all(warm_up_futures).await?; - Ok(()) -} - async fn warm_up_fastfield( fast_field_reader: &FastFieldReaders, fast_field: &FastFieldWarmupInfo, @@ -389,6 +352,10 @@ async fn warm_up_automatons( .await .context("failed to load automaton") } + Automaton::TermSet(automaton) => inv_idx_clone + .warm_postings_automaton(automaton.clone(), cpu_intensive_executor) + .await + .context("failed to warm term set"), } }); } @@ -475,7 +442,10 @@ async fn leaf_search_single_split( // if is_metadata_count_request_with_ast(&query_ast, &search_request) { leaf_search_state_guard.set_state(SplitSearchState::PrunedBeforeWarmup); - return Ok(Some(get_leaf_resp_from_count(split.num_docs))); + let effective_num_docs = split + .num_docs + .saturating_sub(split.soft_deleted_doc_ids.len() as u64); + return Ok(Some(get_leaf_resp_from_count(effective_num_docs))); } let split_id = split.split_id.to_string(); @@ -526,6 +496,14 @@ async fn leaf_search_single_split( false, predicate_cache, )?; + let query: Box = if split.soft_deleted_doc_ids.is_empty() { + query + } else { + Box::new(SoftDeleteQuery::new( + query, + split.soft_deleted_doc_ids.clone(), + )) + }; let collector_warmup_info = collector.warmup_info(); warmup_info.merge(collector_warmup_info); @@ -548,7 +526,6 @@ async fn leaf_search_single_split( .leaf_search_single_split_warmup_num_bytes .observe(warmup_size.as_u64() as f64); search_permit.update_memory_usage(warmup_size); - search_permit.free_warmup_slot(); let split_num_docs = split.num_docs; @@ -576,7 +553,10 @@ async fn leaf_search_single_split( collector.update_search_param(&simplified_search_request); let mut leaf_search_response: LeafSearchResponse = if is_metadata_count_request_with_ast(&query_ast, 
&simplified_search_request) { - get_leaf_resp_from_count(searcher.num_docs()) + let num_docs = searcher + .num_docs() + .saturating_sub(split_clone.soft_deleted_doc_ids.len() as u64); + get_leaf_resp_from_count(num_docs) } else if collector.is_count_only() { let count = query.count(&searcher)? as u64; get_leaf_resp_from_count(count) @@ -809,28 +789,24 @@ fn remove_redundant_timestamp_range( } } (Bound::Unbounded, Some(_)) => Bound::Unbounded, - (timestamp, None) => timestamp, + (query_bound, None) => query_bound, }; - let final_end_timestamp = match ( - visitor.end_timestamp, - split.timestamp_end.map(DateTime::from_timestamp_secs), - ) { - (Bound::Included(query_ts), Some(split_ts)) => { - if query_ts < split_ts { - Bound::Included(query_ts) - } else { - Bound::Unbounded - } - } - (Bound::Excluded(query_ts), Some(split_ts)) => { - if query_ts <= split_ts { - Bound::Excluded(query_ts) + let final_end_timestamp = match (visitor.end_timestamp, split.timestamp_end) { + ( + query_bound @ (Bound::Included(query_ts) | Bound::Excluded(query_ts)), + Some(split_end), + ) => { + // split.timestamp_end is the truncation of the highest timestamp in the split, + // so the actual known bound for the split is split.timestamp_end+1 (exclusive) + let split_end_exclusive = DateTime::from_timestamp_secs(split_end + 1); + if query_ts < split_end_exclusive { + query_bound } else { Bound::Unbounded } } (Bound::Unbounded, Some(_)) => Bound::Unbounded, - (timestamp, None) => timestamp, + (query_bound, None) => query_bound, }; if final_start_timestamp != Bound::Unbounded || final_end_timestamp != Bound::Unbounded { let range = RangeQuery { @@ -1688,6 +1664,11 @@ mod tests { }; remove_timestamp_test_case(&search_request, &split, None); + let expected_upper_inclusive = RangeQuery { + field: timestamp_field.to_string(), + lower_bound: Bound::Unbounded, + upper_bound: Bound::Included((time3 * S_TO_NS).into()), + }; let search_request = SearchRequest { query_ast: 
serde_json::to_string(&QueryAst::Range(RangeQuery { field: timestamp_field.to_string(), @@ -1697,7 +1678,7 @@ mod tests { .unwrap(), ..SearchRequest::default() }; - remove_timestamp_test_case(&search_request, &split, None); + remove_timestamp_test_case(&search_request, &split, Some(expected_upper_inclusive)); let search_request = SearchRequest { query_ast: serde_json::to_string(&QueryAst::MatchAll).unwrap(), @@ -1740,10 +1721,10 @@ mod tests { Some(expected_upper_exclusive.clone()), ); - let expected_lower_exclusive = RangeQuery { + let expected_lower_excl_upper_incl = RangeQuery { field: timestamp_field.to_string(), lower_bound: Bound::Excluded((time2 * S_TO_NS).into()), - upper_bound: Bound::Unbounded, + upper_bound: Bound::Included((time3 * S_TO_NS).into()), }; let search_request = SearchRequest { query_ast: serde_json::to_string(&QueryAst::Range(RangeQuery { @@ -1757,10 +1738,22 @@ mod tests { remove_timestamp_test_case( &search_request, &split, - Some(expected_lower_exclusive.clone()), + Some(expected_lower_excl_upper_incl.clone()), ); + } + + #[test] + fn test_remove_timestamp_range_multiple_bounds() { + // When bounds are defined both in the AST and in the search request, + // make sure we take the most restrictive ones. 
+ const S_TO_NS: i64 = 1_000_000_000; + let time1 = 1700001000; + let time2 = 1700002000; + let time3 = 1700003000; + let time4 = 1700004000; + + let timestamp_field = "timestamp".to_string(); - // we take the most restrictive bounds let split = SplitIdAndFooterOffsets { timestamp_start: Some(time1), timestamp_end: Some(time4), @@ -1803,10 +1796,10 @@ mod tests { }; remove_timestamp_test_case(&search_request, &split, Some(expected_upper_2_inc)); - let expected_lower_3 = RangeQuery { + let expected_lower_3_upper_4 = RangeQuery { field: timestamp_field.to_string(), lower_bound: Bound::Included((time3 * S_TO_NS).into()), - upper_bound: Bound::Unbounded, + upper_bound: Bound::Included((time4 * S_TO_NS).into()), }; let search_request = SearchRequest { @@ -1820,7 +1813,11 @@ mod tests { end_timestamp: Some(time4 + 1), ..SearchRequest::default() }; - remove_timestamp_test_case(&search_request, &split, Some(expected_lower_3.clone())); + remove_timestamp_test_case( + &search_request, + &split, + Some(expected_lower_3_upper_4.clone()), + ); let search_request = SearchRequest { query_ast: serde_json::to_string(&QueryAst::Range(RangeQuery { @@ -1833,7 +1830,7 @@ mod tests { end_timestamp: Some(time4 + 1), ..SearchRequest::default() }; - remove_timestamp_test_case(&search_request, &split, Some(expected_lower_3)); + remove_timestamp_test_case(&search_request, &split, Some(expected_lower_3_upper_4)); let mut search_request = SearchRequest { query_ast: serde_json::to_string(&QueryAst::MatchAll).unwrap(), diff --git a/quickwit/quickwit-search/src/leaf_cache.rs b/quickwit/quickwit-search/src/leaf_cache.rs index abc756763ef..c93cd190c3c 100644 --- a/quickwit/quickwit-search/src/leaf_cache.rs +++ b/quickwit/quickwit-search/src/leaf_cache.rs @@ -85,6 +85,9 @@ struct CacheKey { /// The effective time range of the request, that is, the intersection of the timerange /// requested, and the timerange covered by the split. 
merged_time_range: HalfOpenRange, + /// The number of soft deleted documents in the split. + /// This assumes that the list of deleted docs is append only for a split. + soft_deleted_docs_len: usize, } impl CacheKey { @@ -106,6 +109,7 @@ impl CacheKey { split_id: split_info.split_id, request: search_request, merged_time_range, + soft_deleted_docs_len: split_info.soft_deleted_doc_ids.len(), } } } @@ -253,6 +257,7 @@ mod tests { timestamp_start: None, timestamp_end: None, num_docs: 0, + soft_deleted_doc_ids: Vec::new(), }; let split_2 = SplitIdAndFooterOffsets { @@ -262,6 +267,7 @@ mod tests { timestamp_start: None, timestamp_end: None, num_docs: 0, + soft_deleted_doc_ids: Vec::new(), }; let query_1 = SearchRequest { @@ -319,6 +325,7 @@ mod tests { timestamp_start: Some(100), timestamp_end: Some(199), num_docs: 0, + soft_deleted_doc_ids: Vec::new(), }; let split_2 = SplitIdAndFooterOffsets { split_id: "split_2".to_string(), @@ -327,6 +334,7 @@ mod tests { timestamp_start: Some(150), timestamp_end: Some(249), num_docs: 0, + soft_deleted_doc_ids: Vec::new(), }; let split_3 = SplitIdAndFooterOffsets { split_id: "split_3".to_string(), @@ -335,6 +343,7 @@ mod tests { timestamp_start: Some(150), timestamp_end: Some(249), num_docs: 0, + soft_deleted_doc_ids: Vec::new(), }; let query_1 = SearchRequest { diff --git a/quickwit/quickwit-search/src/lib.rs b/quickwit/quickwit-search/src/lib.rs index 33a21664c3f..74266f42bf2 100644 --- a/quickwit/quickwit-search/src/lib.rs +++ b/quickwit/quickwit-search/src/lib.rs @@ -35,7 +35,10 @@ mod scroll_context; mod search_job_placer; mod search_response_rest; mod service; +mod soft_delete_query; +mod sort_repr; pub(crate) mod top_k_collector; +mod top_k_computer; mod metrics; mod search_permit_provider; @@ -172,6 +175,11 @@ fn extract_split_and_footer_offsets(split_metadata: &SplitMetadata) -> SplitIdAn .as_ref() .map(|time_range| *time_range.end()), num_docs: split_metadata.num_docs as u64, + soft_deleted_doc_ids: split_metadata + 
.soft_deleted_doc_ids + .iter() + .copied() + .collect(), } } diff --git a/quickwit/quickwit-search/src/list_fields_cache.rs b/quickwit/quickwit-search/src/list_fields_cache.rs index 681ce7a2e77..c940893b722 100644 --- a/quickwit/quickwit-search/src/list_fields_cache.rs +++ b/quickwit/quickwit-search/src/list_fields_cache.rs @@ -83,6 +83,7 @@ mod tests { timestamp_start: None, timestamp_end: None, num_docs: 0, + soft_deleted_doc_ids: Vec::new(), }; let split_2 = SplitIdAndFooterOffsets { @@ -92,6 +93,7 @@ mod tests { timestamp_start: None, timestamp_end: None, num_docs: 0, + soft_deleted_doc_ids: Vec::new(), }; let result = ListFieldsEntryResponse { diff --git a/quickwit/quickwit-search/src/metrics.rs b/quickwit/quickwit-search/src/metrics.rs index db4083a7eed..e236fb53d06 100644 --- a/quickwit/quickwit-search/src/metrics.rs +++ b/quickwit/quickwit-search/src/metrics.rs @@ -105,9 +105,9 @@ impl SplitSearchOutcomeCounters { } pub struct SearchMetrics { - pub root_search_requests_total: IntCounterVec<1>, - pub root_search_request_duration_seconds: HistogramVec<1>, - pub root_search_targeted_splits: HistogramVec<1>, + pub root_search_requests_total: IntCounterVec<2>, + pub root_search_request_duration_seconds: HistogramVec<2>, + pub root_search_targeted_splits: HistogramVec<2>, pub leaf_search_requests_total: IntCounterVec<2>, pub leaf_search_request_duration_seconds: HistogramVec<2>, pub leaf_search_targeted_splits: HistogramVec<2>, @@ -170,14 +170,14 @@ impl Default for SearchMetrics { "Total number of root search gRPC requests processed.", "search", &[("kind", "server")], - ["status"], + ["user_agent", "status"], ), root_search_request_duration_seconds: new_histogram_vec( "root_search_request_duration_seconds", "Duration of root search gRPC requests in seconds.", "search", &[("kind", "server")], - ["status"], + ["user_agent", "status"], duration_buckets(), ), root_search_targeted_splits: new_histogram_vec( @@ -185,7 +185,7 @@ impl Default for SearchMetrics { 
"Number of splits targeted per root search GRPC request.", "search", &[], - ["status"], + ["user_agent", "status"], targeted_splits_buckets.clone(), ), leaf_search_requests_total: new_counter_vec( diff --git a/quickwit/quickwit-search/src/metrics_trackers.rs b/quickwit/quickwit-search/src/metrics_trackers.rs index 9539ac2e098..6d9648a5c0e 100644 --- a/quickwit/quickwit-search/src/metrics_trackers.rs +++ b/quickwit/quickwit-search/src/metrics_trackers.rs @@ -20,6 +20,7 @@ use std::time::Instant; use pin_project::{pin_project, pinned_drop}; use quickwit_proto::search::{LeafSearchResponse, SearchResponse}; +use tracing::{Span, record_all}; use crate::SearchError; use crate::metrics::{SEARCH_METRICS, queue_label}; @@ -34,20 +35,26 @@ pub struct SearchPlanMetricsFuture { #[pin] pub tracked: F, pub start: Instant, - pub is_success: Option, + pub status: Option>, + pub user_agent: String, + pub req_span: Span, } #[pinned_drop] impl PinnedDrop for SearchPlanMetricsFuture { fn drop(self: Pin<&mut Self>) { - let status = match self.is_success { + let status = match self.status { // this is a partial success, actual status will be recorded during the search step - Some(true) => return, - Some(false) => "plan-error", - None => "plan-cancelled", + Some(Ok(())) => return, + Some(Err(error)) => error, + None => { + let _guard = self.req_span.enter(); + tracing::info!("root search cancelled"); + "plan-cancelled" + } }; - let label_values = [status]; + let label_values = [normalize_user_agent(&self.user_agent), status]; SEARCH_METRICS .root_search_requests_total .with_label_values(label_values) @@ -68,9 +75,14 @@ where F: Future> let this = self.project(); let response = ready!(this.tracked.poll(cx)); if let Err(err) = &response { + let _guard = this.req_span.enter(); tracing::error!(?err, "root search planning failed"); } - *this.is_success = Some(response.is_ok()); + *this.status = match &response { + Ok(_) => Some(Ok(())), + Err(SearchError::TooManySplits(_)) => 
Some(Err("too-many-splits")), + Err(_) => Some(Err("plan-error")), + }; Poll::Ready(Ok(response?)) } } @@ -85,13 +97,19 @@ pub struct RootSearchMetricsFuture { pub start: Instant, pub num_targeted_splits: usize, pub status: Option<&'static str>, + pub user_agent: String, + pub req_span: Span, } #[pinned_drop] impl PinnedDrop for RootSearchMetricsFuture { fn drop(self: Pin<&mut Self>) { + if self.status.is_none() { + let _guard = self.req_span.enter(); + tracing::info!("root search cancelled"); + } let status = self.status.unwrap_or("cancelled"); - let label_values = [status]; + let label_values = [normalize_user_agent(&self.user_agent), status]; SEARCH_METRICS .root_search_requests_total .with_label_values(label_values) @@ -115,18 +133,25 @@ where F: Future> fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { let this = self.project(); let response = ready!(this.tracked.poll(cx)); + record_all!(this.req_span, elapsed_ms = this.start.elapsed().as_millis()); + let _guard = this.req_span.enter(); if let Err(err) = &response { tracing::error!(?err, "root search failed"); - } - if let Ok(resp) = &response { + *this.status = Some("error"); + } else if let Ok(resp) = &response { if resp.failed_splits.is_empty() { *this.status = Some("success"); + tracing::info!("root search success"); } else { *this.status = Some("partial-success"); + tracing::error!( + failed_splits = resp.failed_splits.len(), + first_failed_split = ?resp.failed_splits.first().unwrap(), + "root search partial success" + ); } - } else { - *this.status = Some("error"); } + Poll::Ready(Ok(response?)) } } @@ -182,3 +207,38 @@ where F: Future> Poll::Ready(Ok(response?)) } } + +/// Simplify the user agent to limit the metric's cardinality. 
+pub fn normalize_user_agent(user_agent: &str) -> &str { + let ua = user_agent.trim(); + + // Browsers always start with "Mozilla/" + if ua.starts_with("Mozilla") { + return "browser"; + } + + let lower = ua.to_ascii_lowercase(); + + // Well-known CLI / library prefixes (match on the start of the lower-cased + // string so version numbers don't matter). + const CLI_PREFIXES: &[&str] = &[ + "curl", + "wget", + "python-httpx", + "python-requests", + "elasticsearch-py", + "go-http-client", + "java", + "okhttp", + "axios", + "ruby", + "node-fetch", + "node", + ]; + if let Some(&prefix) = CLI_PREFIXES.iter().find(|p| lower.starts_with(*p)) { + return prefix; + } + + // Keep short user agents verbatim; bucket anything longer than 64 bytes as "other". + if ua.len() <= 64 { ua } else { "other" } +} diff --git a/quickwit/quickwit-search/src/retry/mod.rs b/quickwit/quickwit-search/src/retry/mod.rs index 996665717cf..a496159d76c 100644 --- a/quickwit/quickwit-search/src/retry/mod.rs +++ b/quickwit/quickwit-search/src/retry/mod.rs @@ -128,6 +128,7 @@ mod tests { timestamp_start: None, timestamp_end: None, num_docs: 0, + soft_deleted_doc_ids: Vec::new(), }; let client_for_retry = retry_client( &search_job_placer, diff --git a/quickwit/quickwit-search/src/retry/search.rs b/quickwit/quickwit-search/src/retry/search.rs index 696a352de94..7ae744c8625 100644 --- a/quickwit/quickwit-search/src/retry/search.rs +++ b/quickwit/quickwit-search/src/retry/search.rs @@ -93,6 +93,7 @@ mod tests { timestamp_start: None, timestamp_end: None, num_docs: 0, + soft_deleted_doc_ids: Vec::new(), }, SplitIdAndFooterOffsets { split_id: "split_2".to_string(), @@ -101,6 +102,7 @@ mod tests { timestamp_start: None, timestamp_end: None, num_docs: 0, + soft_deleted_doc_ids: Vec::new(), }, ], }], diff --git a/quickwit/quickwit-search/src/root.rs b/quickwit/quickwit-search/src/root.rs index 246d3308636..8978e114dce 100644 --- a/quickwit/quickwit-search/src/root.rs +++ b/quickwit/quickwit-search/src/root.rs @@ -45,7 +45,7 @@ use
tantivy::aggregation::agg_result::AggregationResults; use tantivy::aggregation::intermediate_agg_result::IntermediateAggregationResults; use tantivy::collector::Collector; use tantivy::schema::{Field, FieldEntry, FieldType, Schema}; -use tracing::{debug, info, info_span, instrument}; +use tracing::{debug, info_span, instrument, record_all}; use crate::cluster_client::ClusterClient; use crate::collector::{QuickwitAggregations, make_merge_collector}; @@ -161,12 +161,18 @@ pub struct IndexMetasForLeafSearch { pub(crate) type IndexesMetasForLeafSearch = HashMap; +/// Maps to `true` if the field mapping of all indexes is `datetime` for the +/// given sort field. Contains an entry for every sort field. Does not ensure +/// that the field is indeed a datetime in all splits (doc mapping might +/// have been updated). +type SortFieldsIsDatetime = HashMap; + #[derive(Debug)] struct RequestMetadata { timestamp_field_opt: Option, query_ast_resolved: QueryAst, indexes_meta_for_leaf_search: IndexesMetasForLeafSearch, - sort_fields_is_datetime: HashMap, + sort_fields_is_datetime: SortFieldsIsDatetime, } /// Validates request against each index's doc mapper and ensures that: @@ -189,11 +195,10 @@ fn validate_request_and_build_metadata( )?; let query_ast: QueryAst = serde_json::from_str(&search_request.query_ast) .map_err(|err| SearchError::InvalidQuery(err.to_string()))?; - let mut indexes_meta_for_leaf_search: HashMap = - HashMap::new(); + let mut indexes_meta_for_leaf_search: IndexesMetasForLeafSearch = HashMap::new(); let mut query_ast_resolved_opt: Option = None; let mut timestamp_field_opt: Option = None; - let mut sort_fields_is_datetime: HashMap = HashMap::new(); + let mut sort_fields_is_datetime: SortFieldsIsDatetime = HashMap::new(); for index_metadata in indexes_metadata { let doc_mapper = build_doc_mapper( @@ -315,7 +320,7 @@ fn validate_secondary_time(index_metadata: &[IndexMetadata]) -> crate::Result, + sort_field_is_datetime: &mut SortFieldsIsDatetime, ) -> 
crate::Result<()> { for sort_field in sort_fields.iter() { if let Some(sort_field_entry) = get_sort_by_field_entry(&sort_field.field_name, schema)? { @@ -402,6 +407,7 @@ fn simplify_search_request_for_scroll_api(req: &SearchRequest) -> crate::Result< count_hits: quickwit_proto::search::CountHits::Underestimate as i32, ignore_missing_indexes: req.ignore_missing_indexes, split_id: req.split_id.clone(), + user_agent: req.user_agent.clone(), }) } @@ -439,16 +445,10 @@ fn validate_sort_by_fields_and_search_after( } let mut search_after_sort_value_count = 0; - // TODO: we could validate if the search after sort value types of consistent with the sort - // field types. - if let Some(sort_by_value) = search_after_partial_hit.sort_value.as_ref() { - sort_by_value.sort_value.context("sort value must be set")?; + if search_after_partial_hit.sort_value.is_some() { search_after_sort_value_count += 1; } - if let Some(sort_by_value_2) = search_after_partial_hit.sort_value2.as_ref() { - sort_by_value_2 - .sort_value - .context("sort value must be set")?; + if search_after_partial_hit.sort_value2.is_some() { search_after_sort_value_count += 1; } if search_after_sort_value_count != sort_fields_without_doc_count { @@ -486,11 +486,6 @@ fn validate_sort_by_field_type( has_timestamp_format: bool, ) -> crate::Result<()> { let field_name = sort_by_field_entry.name(); - if matches!(sort_by_field_entry.field_type(), FieldType::Str(_)) { - return Err(SearchError::InvalidArgument(format!( - "sort by field on type text is currently not supported `{field_name}`" - ))); - } if !sort_by_field_entry.is_fast() { return Err(SearchError::InvalidArgument(format!( "sort by field must be a fast field, please add the fast property to your field \ @@ -710,7 +705,8 @@ pub fn get_count_from_metadata(split_metadatas: &[SplitMetadata]) -> Vec, query_ast_resolved: QueryAst, - sort_fields_is_datetime: HashMap, + sort_fields_is_datetime: SortFieldsIsDatetime, timestamp_field_opt: Option, 
secondary_timestamp_field_opt: Option, ) -> crate::Result> { @@ -1213,6 +1204,7 @@ async fn refine_and_list_matches( async fn plan_splits_for_root_search( search_request: &mut SearchRequest, metastore: &mut MetastoreServiceClient, + max_splits_per_search: Option, ) -> crate::Result<(Vec, IndexesMetasForLeafSearch)> { let list_indexes_metadatas_request = ListIndexesMetadataRequest { index_id_patterns: search_request.index_id_patterns.clone(), @@ -1246,12 +1238,47 @@ async fn plan_splits_for_root_search( secondary_timestamp_field_opt, ) .await?; + + let num_targeted_splits = split_metadatas.len(); + if let Some(max_total_split_searches) = max_splits_per_search + && num_targeted_splits > max_total_split_searches + { + return Err(SearchError::TooManySplits(format!( + "Targeted split limit exceeded ({num_targeted_splits}>{max_total_split_searches})" + ))); + } + Ok(( split_metadatas, request_metadata.indexes_meta_for_leaf_search, )) } +fn record_request_span(search_request: &SearchRequest) -> tracing::Span { + let span = info_span!( + "request", + indexes = ?PrettySample::new(&search_request.index_id_patterns, 5), + user_agent = search_request.user_agent.as_deref().unwrap_or_default(), + query_ast = %search_request.query_ast, + count_required = search_request.count_hits().as_str_name(), + agg = tracing::field::Empty, + ts_range = tracing::field::Empty, + elapsed_ms = tracing::field::Empty, + targeted_splits_bytes = tracing::field::Empty, + num_targeted_splits = tracing::field::Empty, + ); + if let Some(agg) = search_request.aggregation_request.as_ref() { + record_all!(span, agg = %agg); + } + if search_request.start_timestamp.is_some() || search_request.end_timestamp.is_some() { + record_all!( + span, + ts_range = ?search_request.start_timestamp..search_request.end_timestamp, + ); + } + span +} + /// Performs a distributed search. /// 1. Sends leaf requests over gRPC to multiple leaf nodes. /// 2. Merges the search results. 
@@ -1266,45 +1293,31 @@ pub async fn root_search( ) -> crate::Result { let start_instant = Instant::now(); + let req_span = record_request_span(&search_request); + let (split_metadatas, indexes_meta_for_leaf_search) = SearchPlanMetricsFuture { start: start_instant, - tracked: plan_splits_for_root_search(&mut search_request, &mut metastore), - is_success: None, + user_agent: search_request.user_agent.clone().unwrap_or_default(), + tracked: plan_splits_for_root_search( + &mut search_request, + &mut metastore, + searcher_context.searcher_config.max_splits_per_search, + ), + status: None, + req_span: req_span.clone(), } .await?; - let num_docs: usize = split_metadatas.iter().map(|split| split.num_docs).sum(); - let num_splits = split_metadatas.len(); - - // It would have been nice to add those in the context of the trace span, - // but with our current logging setting, it makes logs too verbose. - info!( - query_ast = search_request.query_ast.as_str(), - agg = search_request.aggregation_request(), - start_ts = ?(search_request.start_timestamp()..search_request.end_timestamp()), - count_required = search_request.count_hits().as_str_name(), - num_docs = num_docs, - num_splits = num_splits, - "root_search" - ); - - if let Some(max_total_split_searches) = searcher_context.searcher_config.max_splits_per_search - && max_total_split_searches < num_splits - { - tracing::error!( - num_splits, - max_total_split_searches, - index=?PrettySample::new(search_request.index_id_patterns, 5), - query=%search_request.query_ast, - "max total splits exceeded" - ); - return Err(SearchError::InvalidArgument(format!( - "Number of targeted splits {num_splits} exceeds the limit {max_total_split_searches}" - ))); - } + let targeted_splits_bytes: u64 = split_metadatas + .iter() + .map(|split| split.footer_offsets.end) + .sum(); + let num_targeted_splits = split_metadatas.len(); + record_all!(req_span, targeted_splits_bytes, num_targeted_splits); let mut search_response_result = 
RootSearchMetricsFuture { start: start_instant, + user_agent: search_request.user_agent.clone().unwrap_or_default(), tracked: root_search_aux( searcher_context, &indexes_meta_for_leaf_search, @@ -1313,7 +1326,8 @@ pub async fn root_search( cluster_client, ), status: None, - num_targeted_splits: num_splits, + num_targeted_splits, + req_span, } .await; @@ -1397,12 +1411,11 @@ pub async fn search_plan( } else { 0 }; - let sstable_query_count = warmup_info.term_dict_fields.len() - + warmup_info - .terms_grouped_by_field - .values() - .map(|terms: &HashMap| terms.len()) - .sum::() + let sstable_query_count = warmup_info + .terms_grouped_by_field + .values() + .map(|terms: &HashMap| terms.len()) + .sum::() + warmup_info .term_ranges_grouped_by_field .values() @@ -1448,10 +1461,9 @@ pub async fn search_plan( /// Converts search after with datetime format to nanoseconds (representation in tantivy). /// If the sort field is a datetime field and no datetime format is set, the default format is /// milliseconds. -/// `sort_fields_are_datetime_opt` must be of the same length as `search_request.sort_fields`. fn convert_search_after_datetime_values( search_request: &mut SearchRequest, - sort_fields_is_datetime: &HashMap, + sort_fields_is_datetime: &SortFieldsIsDatetime, ) -> crate::Result<()> { for sort_field in search_request.sort_fields.iter_mut() { if *sort_fields_is_datetime @@ -1488,79 +1500,57 @@ fn convert_search_after_datetime_values( Ok(()) } -/// Convert sort values from input datetime format into nanoseconds. -/// The conversion is done only for U64 and I64 sort values, an error is returned for other types. +/// Converts a numerical sort value from the given input datetime format into a `Datetime` sort +/// value (nanoseconds, tantivy's internal datetime representation). +/// Only `U64` and `I64` sort values are accepted; an error is returned for other types. 
fn convert_sort_datetime_value_into_nanos( sort_value: &mut SortValue, input_format: SortDatetimeFormat, ) -> crate::Result<()> { - match sort_value { - SortValue::U64(value) => match input_format { - SortDatetimeFormat::UnixTimestampMillis => { - *value = value.checked_mul(1_000_000).ok_or_else(|| { - SearchError::Internal(format!( - "sort value defined in milliseconds is too large and cannot be converted \ - into nanoseconds: {value}" - )) - })?; - } - SortDatetimeFormat::UnixTimestampNanos => { - // Nothing to do as the internal format is nanos. - } - }, - SortValue::I64(value) => match input_format { - SortDatetimeFormat::UnixTimestampMillis => { - *value = value.checked_mul(1_000_000).ok_or_else(|| { - SearchError::Internal(format!( - "sort value defined in milliseconds is too large and cannot be converted \ - into nanoseconds: {value}" - )) - })?; - } - SortDatetimeFormat::UnixTimestampNanos => { - // Nothing to do as the internal format is nanos. - } - }, + // Normalise to i64, even though in theory the sort value should be parsed as i64 anyway. + let raw: i64 = match sort_value { + SortValue::U64(value) => i64::try_from(*value).map_err(|_| { + SearchError::Internal(format!( + "sort value is too large to be represented as a datetime: {value}" + )) + })?, + SortValue::I64(value) => *value, _ => { return Err(SearchError::Internal(format!( - "datetime conversion are only support for u64 and i64 sort values, not \ + "datetime conversion is only supported for u64 and i64 sort values, not \ `{sort_value:?}`" ))); } - } + }; + let nanos: i64 = match input_format { + SortDatetimeFormat::UnixTimestampMillis => raw.checked_mul(1_000_000).ok_or_else(|| { + SearchError::Internal(format!( + "sort value defined in milliseconds is too large to be a timestamp: {raw}" + )) + })?, + SortDatetimeFormat::UnixTimestampNanos => raw, + }; + *sort_value = SortValue::Datetime(nanos); Ok(()) } -/// Convert sort values from nanoseconds to the requested output format. 
-/// The conversion is done only for U64 and I64 sort values, an error is returned for other types. -fn convert_sort_datetime_value( +/// Converts a `Datetime` sort value (nanoseconds, tantivy's internal representation) into the +/// requested output format, replacing the value in place. +/// +/// Only the `Datetime` variant is accepted; an error is returned for other types. +fn convert_sort_datetime_value_from_nanos( sort_value: &mut SortValue, output_format: SortDatetimeFormat, ) -> crate::Result<()> { - match sort_value { - SortValue::U64(value) => match output_format { - SortDatetimeFormat::UnixTimestampMillis => { - *value /= 1_000_000; - } - SortDatetimeFormat::UnixTimestampNanos => { - // Nothing todo as the internal format is in nanos. - } - }, - SortValue::I64(value) => match output_format { - SortDatetimeFormat::UnixTimestampMillis => { - *value /= 1_000_000; - } - SortDatetimeFormat::UnixTimestampNanos => { - // Nothing todo as the internal format is in nanos. - } - }, - _ => { - return Err(SearchError::Internal(format!( - "datetime conversion are only support for u64 and i64 sort values, not \ - `{sort_value:?}`" - ))); - } - } + let SortValue::Datetime(nanos) = sort_value else { + return Err(SearchError::Internal(format!( + "datetime conversion is only supported for datetime sort values, not `{sort_value:?}`" + ))); + }; + *sort_value = match output_format { + SortDatetimeFormat::UnixTimestampMillis => SortValue::I64(*nanos / 1_000_000), + SortDatetimeFormat::UnixTimestampNanos => SortValue::I64(*nanos), + }; Ok(()) } @@ -2179,27 +2169,65 @@ mod tests { #[test] fn test_convert_sort_datetime_value() { - let mut sort_value = SortValue::U64(1617000000000000000); - convert_sort_datetime_value(&mut sort_value, SortDatetimeFormat::UnixTimestampMillis) - .unwrap(); - assert_eq!(sort_value, SortValue::U64(1617000000000)); - let mut sort_value = SortValue::I64(1617000000000000000); - convert_sort_datetime_value(&mut sort_value, 
SortDatetimeFormat::UnixTimestampMillis) - .unwrap(); + // millis output + let mut sort_value = SortValue::Datetime(1617000000000000000); + convert_sort_datetime_value_from_nanos( + &mut sort_value, + SortDatetimeFormat::UnixTimestampMillis, + ) + .unwrap(); assert_eq!(sort_value, SortValue::I64(1617000000000)); - // conversion with float values should fail. + // nanos output + let mut sort_value = SortValue::Datetime(1617000000000000000); + convert_sort_datetime_value_from_nanos( + &mut sort_value, + SortDatetimeFormat::UnixTimestampNanos, + ) + .unwrap(); + assert_eq!(sort_value, SortValue::I64(1617000000000000000)); + + // non-datetime values should fail. let mut sort_value = SortValue::F64(1617000000000000000.0); - let error = - convert_sort_datetime_value(&mut sort_value, SortDatetimeFormat::UnixTimestampMillis) - .unwrap_err(); + let error = convert_sort_datetime_value_from_nanos( + &mut sort_value, + SortDatetimeFormat::UnixTimestampMillis, + ) + .unwrap_err(); assert_eq!( error.to_string(), - "internal error: `datetime conversion are only support for u64 and i64 sort values, \ - not `F64(1.617e18)``" + "internal error: `datetime conversion is only supported for datetime sort values, not \ + `F64(1.617e18)``" ); } + #[test] + fn test_sort_datetime_value_roundtrip() { + use quickwit_proto::search::SortByValue; + let nanos: i64 = 1617000000000000000; + + for format in [ + SortDatetimeFormat::UnixTimestampMillis, + SortDatetimeFormat::UnixTimestampNanos, + ] { + let mut sort_value = SortValue::Datetime(nanos); + convert_sort_datetime_value_from_nanos(&mut sort_value, format).unwrap(); + + let json = SortByValue::from(sort_value).into_json(); + + let sort_by_value = SortByValue::try_from_json(json).unwrap(); + let mut sort_value = sort_by_value.sort_value.unwrap(); + + convert_sort_datetime_value_into_nanos(&mut sort_value, format).unwrap(); + + assert_eq!( + sort_value, + SortValue::Datetime(nanos), + "roundtrip failed for format {format:?}" + ); + } + } + 
#[test] fn test_convert_sort_datetime_value_into_nanos() { let mut sort_value = SortValue::U64(1617000000000); @@ -2208,39 +2236,29 @@ mod tests { SortDatetimeFormat::UnixTimestampMillis, ) .unwrap(); - assert_eq!(sort_value, SortValue::U64(1617000000000000000)); + assert_eq!(sort_value, SortValue::Datetime(1617000000000000000)); let mut sort_value = SortValue::I64(1617000000000); convert_sort_datetime_value_into_nanos( &mut sort_value, SortDatetimeFormat::UnixTimestampMillis, ) .unwrap(); - assert_eq!(sort_value, SortValue::I64(1617000000000000000)); + assert_eq!(sort_value, SortValue::Datetime(1617000000000000000)); // conversion with a too large millisecond value should fail. let mut sort_value = SortValue::I64(1617000000000000); - let error = convert_sort_datetime_value_into_nanos( + convert_sort_datetime_value_into_nanos( &mut sort_value, SortDatetimeFormat::UnixTimestampMillis, ) .unwrap_err(); - assert_eq!( - error.to_string(), - "internal error: `sort value defined in milliseconds is too large and cannot be \ - converted into nanoseconds: 1617000000000000`" - ); // conversion with float values should fail. 
let mut sort_value = SortValue::F64(1617000000000000.0); - let error = convert_sort_datetime_value_into_nanos( + convert_sort_datetime_value_into_nanos( &mut sort_value, SortDatetimeFormat::UnixTimestampMillis, ) .unwrap_err(); - assert_eq!( - error.to_string(), - "internal error: `datetime conversion are only support for u64 and i64 sort values, \ - not `F64(1617000000000000.0)``" - ); } #[test] @@ -2411,7 +2429,7 @@ mod tests { let timestamp_field = schema_builder.add_date_field("timestamp", FAST); let id_field = schema_builder.add_u64_field("id", FAST); let no_fast_field = schema_builder.add_u64_field("no_fast", STORED); - let text_field = schema_builder.add_text_field("text", STORED); + let text_field = schema_builder.add_text_field("text", FAST); let schema = schema_builder.build(); { let sort_by_field_entry = schema.get_field_entry(timestamp_field); @@ -2439,11 +2457,7 @@ mod tests { } { let sort_by_field_entry = schema.get_field_entry(text_field); - let error = validate_sort_by_field_type(sort_by_field_entry, true).unwrap_err(); - assert_eq!( - error.to_string(), - "Invalid argument: sort by field on type text is currently not supported `text`" - ); + validate_sort_by_field_type(sort_by_field_entry, false).unwrap(); } } @@ -2987,9 +3001,9 @@ mod tests { query_ast: qast_json_helper("test", &["body"]), max_hits: 10, sort_fields: vec![SortField { - field_name: "response_date".to_string(), + field_name: "response_time".to_string(), sort_order: SortOrder::Asc.into(), - sort_datetime_format: Some(SortDatetimeFormat::UnixTimestampNanos as i32), + ..Default::default() }], ..Default::default() }; @@ -3169,9 +3183,9 @@ mod tests { query_ast: qast_json_helper("test", &["body"]), max_hits: 10, sort_fields: vec![SortField { - field_name: "response_date".to_string(), + field_name: "response_time".to_string(), sort_order: SortOrder::Desc.into(), - sort_datetime_format: Some(SortDatetimeFormat::UnixTimestampNanos as i32), + ..Default::default() }], ..Default::default() }; 
@@ -5356,7 +5370,7 @@ mod tests { ) .await .unwrap_err(); - assert!(matches!(search_error, SearchError::InvalidArgument { .. })); + assert!(matches!(search_error, SearchError::TooManySplits { .. })); Ok(()) } } diff --git a/quickwit/quickwit-search/src/search_permit_provider.rs b/quickwit/quickwit-search/src/search_permit_provider.rs index fac7c5e2e3e..502c54710a3 100644 --- a/quickwit/quickwit-search/src/search_permit_provider.rs +++ b/quickwit/quickwit-search/src/search_permit_provider.rs @@ -12,7 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::collections::VecDeque; +use std::collections::BinaryHeap; +use std::collections::binary_heap::PeekMut; use std::future::Future; use std::pin::Pin; use std::task::{Context, Poll}; @@ -28,11 +29,11 @@ use crate::metrics::SearchTaskMetrics; /// Distributor of permits to perform split search operation. /// -/// Requests are served in order. Each permit initially reserves a slot for the -/// warmup (limit concurrent downloads) and a pessimistic amount of memory. Once -/// the warmup is completed, the actual memory usage is set and the warmup slot -/// is released. Once the search is completed and the permit is dropped, the -/// remaining memory is also released. +/// Leaf requests with the fewest pending split permits are served first; within a leaf +/// request, permits are granted in request order. Each permit reserves a slot for concurrent +/// search execution and a pessimistic amount of memory. The slot is held for the entire +/// duration of the search. Once the actual memory usage is known, it can be updated via +/// `update_memory_usage()`. Dropping the permit releases both the search slot and the memory.
#[derive(Clone)] pub struct SearchPermitProvider { message_sender: mpsc::UnboundedSender, @@ -48,10 +49,8 @@ pub enum SearchPermitMessage { UpdateMemory { memory_delta: i64, }, - FreeWarmupSlot, Drop { memory_size: u64, - warmup_slot_freed: bool, }, } @@ -81,7 +80,7 @@ pub fn compute_initial_memory_allocation( impl SearchPermitProvider { pub fn new( - num_download_slots: usize, + max_num_concurrent_split_searches: usize, memory_budget: ByteSize, metrics: SearchTaskMetrics, ) -> Self { @@ -91,9 +90,9 @@ impl SearchPermitProvider { let actor = SearchPermitActor { msg_receiver: message_receiver, msg_sender: message_sender.downgrade(), - num_warmup_slots_available: num_download_slots, + num_search_slots_available: max_num_concurrent_split_searches, total_memory_budget: memory_budget.as_u64(), - permits_requests: VecDeque::new(), + permits_requests: BinaryHeap::new(), total_memory_allocated: 0u64, #[cfg(test)] stopped: state_sender, @@ -118,8 +117,11 @@ impl SearchPermitProvider { &self, splits: impl IntoIterator, ) -> Vec { + let permit_sizes: Vec = splits.into_iter().map(|size| size.as_u64()).collect(); + if permit_sizes.is_empty() { + return Vec::new(); + } let (permit_sender, permit_receiver) = oneshot::channel(); - let permit_sizes = splits.into_iter().map(|size| size.as_u64()).collect(); self.message_sender .send(SearchPermitMessage::Request { permit_sender, @@ -136,17 +138,91 @@ struct SearchPermitActor { metrics: SearchTaskMetrics, msg_receiver: mpsc::UnboundedReceiver, msg_sender: mpsc::WeakUnboundedSender, - num_warmup_slots_available: usize, + num_search_slots_available: usize, /// Note it is possible for memory_allocated to exceed memory_budget temporarily, /// if and only if a split leaf search task ended up using more than `initial_allocation`. /// When it happens, new permits will not be assigned until the memory is freed. 
total_memory_budget: u64, total_memory_allocated: u64, - permits_requests: VecDeque<(oneshot::Sender, u64)>, + permits_requests: BinaryHeap, #[cfg(test)] stopped: watch::Sender, } +struct SingleSplitPermitRequest { + permit_sender: oneshot::Sender, + permit_size: u64, +} + +struct LeafPermitRequest { + /// Single split permit requests for this leaf search. + single_split_permit_requests: std::vec::IntoIter, +} + +impl Ord for LeafPermitRequest { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + // we compare other with self and not the other way around because we want a min-heap and + // Rust's is a max-heap + other + .single_split_permit_requests + .as_slice() + .len() + .cmp(&self.single_split_permit_requests.as_slice().len()) + } +} + +impl PartialOrd for LeafPermitRequest { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl PartialEq for LeafPermitRequest { + fn eq(&self, other: &Self) -> bool { + self.cmp(other).is_eq() + } +} + +impl Eq for LeafPermitRequest {} + +impl LeafPermitRequest { + fn from_estimated_costs(permit_sizes: Vec) -> (Self, Vec) { + let mut permits = Vec::with_capacity(permit_sizes.len()); + let mut single_split_permit_requests = Vec::with_capacity(permit_sizes.len()); + for permit_size in permit_sizes { + let (tx, rx) = oneshot::channel(); + // we keep our internal list of permits and the returned wait handles in the + // same order to make sure we emit each permit in the right order. 
Doing otherwise + // may cause deadlocks + single_split_permit_requests.push(SingleSplitPermitRequest { + permit_sender: tx, + permit_size, + }); + permits.push(SearchPermitFuture(rx)); + } + ( + LeafPermitRequest { + single_split_permit_requests: single_split_permit_requests.into_iter(), + }, + permits, + ) + } + + fn pop_if_smaller_than(&mut self, max_size: u64) -> Option { + // IntoIter::as_slice() allows us to peek at the next element without consuming it + match self.single_split_permit_requests.as_slice().first() { + Some(request) if request.permit_size <= max_size => { + self.single_split_permit_requests.next() + } + _ => None, + } + } + + fn is_empty(&self) -> bool { + self.single_split_permit_requests.as_slice().is_empty() + } +} + impl SearchPermitActor { async fn run(mut self) { // Stops when the last clone of SearchPermitProvider is dropped. @@ -163,12 +239,14 @@ impl SearchPermitActor { permit_sizes, permit_sender, } => { - let mut permits = Vec::with_capacity(permit_sizes.len()); - for permit_size in permit_sizes { - let (tx, rx) = oneshot::channel(); - self.permits_requests.push_back((tx, permit_size)); - permits.push(SearchPermitFuture(rx)); - } + assert_ne!( + permit_sizes.len(), + 0, + "empty permit request would lead to deadlock" + ); + let (leaf_permit_request, permits) = + LeafPermitRequest::from_estimated_costs(permit_sizes); + self.permits_requests.push(leaf_permit_request); self.assign_available_permits(); // The receiver could be dropped in the (unlikely) situation // where the future requesting these permits is cancelled before @@ -183,17 +261,8 @@ impl SearchPermitActor { (self.total_memory_allocated as i64 + memory_delta) as u64; self.assign_available_permits(); } - SearchPermitMessage::FreeWarmupSlot => { - self.num_warmup_slots_available += 1; - self.assign_available_permits(); - } - SearchPermitMessage::Drop { - memory_size, - warmup_slot_freed, - } => { - if !warmup_slot_freed { - self.num_warmup_slots_available += 1; - } + 
SearchPermitMessage::Drop { memory_size } => { + self.num_search_slots_available += 1; self.total_memory_allocated = self .total_memory_allocated .checked_sub(memory_size) @@ -203,41 +272,48 @@ impl SearchPermitActor { } } - fn pop_next_request_if_serviceable(&mut self) -> Option<(oneshot::Sender, u64)> { - if self.num_warmup_slots_available == 0 { + fn pop_next_request_if_serviceable(&mut self) -> Option { + if self.num_search_slots_available == 0 { return None; } - if let Some((_, next_permit_size)) = self.permits_requests.front() - && self.total_memory_allocated + next_permit_size <= self.total_memory_budget - { - return self.permits_requests.pop_front(); + let available_memory = self + .total_memory_budget + .checked_sub(self.total_memory_allocated)?; + let mut peeked = self.permits_requests.peek_mut()?; + + if let Some(permit_request) = peeked.pop_if_smaller_than(available_memory) { + if peeked.is_empty() { + PeekMut::pop(peeked); + } + return Some(permit_request); } None } fn assign_available_permits(&mut self) { - let ongoing_tasks_metric = self.metrics.ongoing_tasks; - while let Some((permit_requester_tx, next_permit_size)) = - self.pop_next_request_if_serviceable() - { + while let Some(permit_request) = self.pop_next_request_if_serviceable() { + let ongoing_tasks_metric = self.metrics.ongoing_tasks; let mut ongoing_gauge_guard = GaugeGuard::from_gauge(ongoing_tasks_metric); ongoing_gauge_guard.add(1); - self.total_memory_allocated += next_permit_size; - self.num_warmup_slots_available -= 1; - permit_requester_tx + self.total_memory_allocated += permit_request.permit_size; + self.num_search_slots_available -= 1; + permit_request + .permit_sender .send(SearchPermit { _ongoing_gauge_guard: ongoing_gauge_guard, msg_sender: self.msg_sender.clone(), - memory_allocation: next_permit_size, - warmup_slot_freed: false, + memory_allocation: permit_request.permit_size, }) // if the requester dropped its receiver, we drop the newly // created SearchPermit which 
releases the resources .ok(); } - self.metrics - .pending_tasks - .set(self.permits_requests.len() as i64); + let pending_tasks = self + .permits_requests + .iter() + .map(|leaf_req| leaf_req.single_split_permit_requests.as_slice().len() as i64) + .sum(); + self.metrics.pending_tasks.set(pending_tasks); } } @@ -245,7 +321,6 @@ pub struct SearchPermit { _ongoing_gauge_guard: GaugeGuard<'static>, msg_sender: mpsc::WeakUnboundedSender, memory_allocation: u64, - warmup_slot_freed: bool, } impl SearchPermit { @@ -259,16 +334,6 @@ impl SearchPermit { self.send_if_still_running(SearchPermitMessage::UpdateMemory { memory_delta }); } - /// Drop the warmup permit, allowing more downloads to be started. Only one - /// slot is attached to each permit so calling this again has no effect. - pub fn free_warmup_slot(&mut self) { - if self.warmup_slot_freed { - return; - } - self.warmup_slot_freed = true; - self.send_if_still_running(SearchPermitMessage::FreeWarmupSlot); - } - pub fn memory_allocation(&self) -> ByteSize { ByteSize(self.memory_allocation) } @@ -288,7 +353,6 @@ impl Drop for SearchPermit { fn drop(&mut self) { self.send_if_still_running(SearchPermitMessage::Drop { memory_size: self.memory_allocation, - warmup_slot_freed: self.warmup_slot_freed, }); } } @@ -324,6 +388,18 @@ mod tests { SEARCH_METRICS.search_task_metrics() } + #[tokio::test] + async fn test_get_permits_empty() { + let permit_provider = SearchPermitProvider::new(1, ByteSize::mb(100), test_metrics()); + let permits = permit_provider.get_permits(std::iter::empty()).await; + assert!(permits.is_empty()); + + // Subsequent non-empty requests must still be served normally. 
+ let permits = permit_provider.get_permits([ByteSize::mb(10)]).await; + assert_eq!(permits.len(), 1); + let _permit = permits.into_iter().next().unwrap().await; + } + #[tokio::test] async fn test_search_permit_order() { let permit_provider = SearchPermitProvider::new(1, ByteSize::mb(100), test_metrics()); @@ -374,6 +450,75 @@ mod tests { } } + #[tokio::test] + async fn test_search_permit_order_with_concurrent_search() { + let permit_provider = SearchPermitProvider::new(4, ByteSize::mb(100), test_metrics()); + let mut all_futures = Vec::new(); + let first_batch_of_permits = permit_provider + .get_permits(repeat_n(ByteSize::mb(10), 8)) + .await; + assert_eq!(first_batch_of_permits.len(), 8); + all_futures.extend( + first_batch_of_permits + .into_iter() + .enumerate() + .map(move |(i, fut)| ((1, i), fut)), + ); + + let second_batch_of_permits = permit_provider + .get_permits(repeat_n(ByteSize::mb(10), 2)) + .await; + all_futures.extend( + second_batch_of_permits + .into_iter() + .enumerate() + .map(move |(i, fut)| ((2, i), fut)), + ); + + let third_batch_of_permits = permit_provider + .get_permits(repeat_n(ByteSize::mb(10), 6)) + .await; + all_futures.extend( + third_batch_of_permits + .into_iter() + .enumerate() + .map(move |(i, fut)| ((3, i), fut)), + ); + + // not super useful, considering what join set does, but still a tiny bit more sound. 
+ all_futures.shuffle(&mut rand::rng()); + + let mut join_set = JoinSet::new(); + for (res, fut) in all_futures { + join_set.spawn(async move { + let permit = fut.await; + (res, permit) + }); + } + let mut ordered_result: Vec<(usize, usize)> = Vec::with_capacity(20); + while let Some(Ok(((batch_id, order), _permit))) = join_set.join_next().await { + ordered_result.push((batch_id, order)); + } + + let mut counters = [0; 4]; + let expected_result: Vec<(usize, usize)> = [ + 1, 1, 1, 1, // initial 4 permits + 2, 2, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, + ] + .into_iter() + .map(|batch_id| { + let order = counters[batch_id]; + counters[batch_id] += 1; + (batch_id, order) + }) + .collect(); + + // for the first 4 permits, the order is not well defined as they are all granted at once, + // and we poll futures in a random order. We sort them to fix that artifact + ordered_result[..4].sort(); + assert_eq!(ordered_result, expected_result); + } + #[tokio::test] async fn test_search_permit_early_drops() { let permit_provider = SearchPermitProvider::new(1, ByteSize::mb(100), test_metrics()); @@ -446,7 +591,7 @@ mod tests { } #[tokio::test] - async fn test_warmup_slot() { + async fn test_concurrent_search_slots() { let permit_provider = SearchPermitProvider::new(10, ByteSize::mb(100), test_metrics()); let mut permit_futs = permit_provider .get_permits(repeat_n(ByteSize::mb(1), 16)) @@ -458,27 +603,19 @@ mod tests { .buffered(1) .collect() .await; - // the next permit is blocked by the warmup slots + // the next permit is blocked by the concurrent search slots let next_blocked_permit_fut = remaining_permit_futs.next().unwrap(); try_get(next_blocked_permit_fut).await.err().unwrap(); // if we drop one of the permits, we can get a new one permits.drain(0..1); let next_permit_fut = remaining_permit_futs.next().unwrap(); permits.push(try_get(next_permit_fut).await.unwrap()); - // the next permit is blocked again by the warmup slots + // the next permit is blocked again by the concurrent 
search slots let next_blocked_permit_fut = remaining_permit_futs.next().unwrap(); try_get(next_blocked_permit_fut).await.err().unwrap(); - // we can explicitly free the warmup slot on a permit - permits[0].free_warmup_slot(); + // dropping a permit frees up a slot + permits.drain(0..1); let next_permit_fut = remaining_permit_futs.next().unwrap(); permits.push(try_get(next_permit_fut).await.unwrap()); - // dropping that same permit does not free up another slot - permits.drain(0..1); - let next_blocked_permit_fut = remaining_permit_futs.next().unwrap(); - try_get(next_blocked_permit_fut).await.err().unwrap(); - // but dropping a permit for which the slot wasn't explicitly free does free up a slot - permits.drain(0..1); - let next_blocked_permit_fut = remaining_permit_futs.next().unwrap(); - permits.push(try_get(next_blocked_permit_fut).await.unwrap()); } } diff --git a/quickwit/quickwit-search/src/search_response_rest.rs b/quickwit/quickwit-search/src/search_response_rest.rs index 58eddc7b927..c2cbafaf392 100644 --- a/quickwit/quickwit-search/src/search_response_rest.rs +++ b/quickwit/quickwit-search/src/search_response_rest.rs @@ -52,8 +52,6 @@ pub struct SearchResponseRest { pub snippets: Option>, /// Elapsed time. pub elapsed_time_micros: u64, - /// Search errors. - pub errors: Vec, /// Aggregations. 
#[schema(value_type = Object)] #[serde(skip_serializing_if = "Option::is_none")] @@ -107,7 +105,6 @@ impl TryFrom for SearchResponseRest { hits: documents, snippets: snippet_opt, elapsed_time_micros: search_response.elapsed_time_micros, - errors: search_response.errors, aggregations: aggregations_opt, }) } diff --git a/quickwit/quickwit-search/src/service.rs b/quickwit/quickwit-search/src/service.rs index 890052a5053..71efc16959f 100644 --- a/quickwit/quickwit-search/src/service.rs +++ b/quickwit/quickwit-search/src/service.rs @@ -406,7 +406,6 @@ pub(crate) async fn scroll( num_hits: scroll_context.total_num_hits, elapsed_time_micros: start.elapsed().as_micros() as u64, scroll_id: Some(next_scroll_id.to_string()), - errors: Vec::new(), aggregation_postcard: None, failed_splits: scroll_context.failed_splits, num_successful_splits: scroll_context.num_successful_splits, diff --git a/quickwit/quickwit-search/src/soft_delete_query.rs b/quickwit/quickwit-search/src/soft_delete_query.rs new file mode 100644 index 00000000000..8283523359d --- /dev/null +++ b/quickwit/quickwit-search/src/soft_delete_query.rs @@ -0,0 +1,321 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::fmt; +use std::sync::Arc; + +use tantivy::query::{EnableScoring, Exclude, Explanation, Query, QueryClone, Scorer, Weight}; +use tantivy::{DocId, DocSet, Score, SegmentReader, TERMINATED}; + +/// A [`DocSet`] backed by a sorted, deduplicated vector of doc IDs. +/// +/// Used as the excluding [`DocSet`] argument passed to [`Exclude`] when +/// constructing a scorer inside [`SoftDeleteWeight`]. +/// +/// # Invariant +/// +/// The underlying slice must be sorted in strictly ascending order and free of +/// duplicates. This is guaranteed by [`SoftDeleteQuery::new`], which sorts and +/// deduplicates the input before storing it. +struct SortedDocIdSet { + doc_ids: Arc>, + /// Index of the current document inside `doc_ids`; past the end once terminated. + cursor: usize, +} + +impl SortedDocIdSet { + fn new(doc_ids: Arc>) -> Self { + SortedDocIdSet { doc_ids, cursor: 0 } + } +} + +impl DocSet for SortedDocIdSet { + #[inline] + fn advance(&mut self) -> DocId { + self.cursor += 1; + self.doc() + } + + fn seek(&mut self, target: DocId) -> DocId { + // The DocSet contract guarantees seek() is always called with a + // non-decreasing target, so we only binary-search the ids at or after cursor. + let remaining = self.doc_ids.get(self.cursor..).unwrap_or(&[]); + let offset = remaining.partition_point(|&id| id < target); + self.cursor += offset; + self.doc() + } + + #[inline] + fn doc(&self) -> DocId { + self.doc_ids.get(self.cursor).copied().unwrap_or(TERMINATED) + } + + fn size_hint(&self) -> u32 { + self.doc_ids.len().saturating_sub(self.cursor) as u32 + } +} + +/// [`Weight`] produced by [`SoftDeleteQuery`]. +/// +/// Wraps the inner weight's scorer with [`Exclude`] to filter out +/// soft-deleted doc IDs transparently across all collection paths.
+struct SoftDeleteWeight { + inner: Box, + deleted_doc_ids: Arc>, +} + +impl Weight for SoftDeleteWeight { + fn scorer(&self, reader: &SegmentReader, boost: Score) -> tantivy::Result> { + let inner_scorer = self.inner.scorer(reader, boost)?; + let excluded = SortedDocIdSet::new(Arc::clone(&self.deleted_doc_ids)); + Ok(Box::new(Exclude::new(inner_scorer, excluded))) + } + + fn explain(&self, reader: &SegmentReader, doc: DocId) -> tantivy::Result { + self.inner.explain(reader, doc) + } +} + +/// A tantivy [`Query`] that wraps another query and excludes a fixed set of +/// soft-deleted doc IDs from every result set it produces. +pub(crate) struct SoftDeleteQuery { + inner: Box, + /// Sorted, deduplicated tantivy doc IDs to exclude. + deleted_doc_ids: Arc>, +} + +impl SoftDeleteQuery { + /// Creates a new [`SoftDeleteQuery`]. + /// + /// `deleted_doc_ids` may be supplied in any order and may contain + /// duplicates; this constructor sorts and deduplicates the input. + pub(crate) fn new(inner: Box, mut deleted_doc_ids: Vec) -> Self { + deleted_doc_ids.sort_unstable(); + deleted_doc_ids.dedup(); + SoftDeleteQuery { + inner, + deleted_doc_ids: Arc::new(deleted_doc_ids), + } + } +} + +impl Clone for SoftDeleteQuery { + fn clone(&self) -> Self { + SoftDeleteQuery { + inner: self.inner.box_clone(), + deleted_doc_ids: Arc::clone(&self.deleted_doc_ids), + } + } +} + +impl fmt::Debug for SoftDeleteQuery { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("SoftDeleteQuery") + .field("inner", &self.inner) + .field("num_deleted", &self.deleted_doc_ids.len()) + .finish() + } +} + +impl Query for SoftDeleteQuery { + fn weight(&self, enable_scoring: EnableScoring<'_>) -> tantivy::Result> { + let inner_weight = self.inner.weight(enable_scoring)?; + Ok(Box::new(SoftDeleteWeight { + inner: inner_weight, + deleted_doc_ids: Arc::clone(&self.deleted_doc_ids), + })) + } + + fn query_terms<'a>(&'a self, visitor: &mut dyn FnMut(&'a tantivy::Term, bool)) { + 
self.inner.query_terms(visitor); + } +} + +#[cfg(test)] +mod tests { + use tantivy::collector::Count; + use tantivy::query::AllQuery; + use tantivy::schema::{Schema, TEXT}; + use tantivy::{Index, IndexWriter}; + + use super::*; + + /// Creates a single-segment, in-RAM index containing `num_docs` documents. + /// + /// Returns `(index, reader)`. The tantivy doc IDs are 0-based and + /// contiguous inside the single segment, so doc ID `k` corresponds to the + /// (k+1)-th inserted document. + fn make_index(num_docs: usize) -> tantivy::Result<(Index, tantivy::IndexReader)> { + let mut schema_builder = Schema::builder(); + let text_field = schema_builder.add_text_field("text", TEXT); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + let mut writer: IndexWriter = index.writer(15_000_000)?; + for i in 0..num_docs { + writer.add_document(tantivy::doc!(text_field => format!("doc {i}")))?; + } + writer.commit()?; + let reader = index.reader()?; + Ok((index, reader)) + } + + // ── SortedDocIdSet unit tests ───────────────────────────────────────────── + + #[test] + fn test_sorted_doc_id_set_advance_through_all() { + let ids = Arc::new(vec![2u32, 5, 8, 11]); + let mut ds = SortedDocIdSet::new(ids); + + assert_eq!(ds.doc(), 2); + assert_eq!(ds.advance(), 5); + assert_eq!(ds.advance(), 8); + assert_eq!(ds.advance(), 11); + // Advancing past the last element returns TERMINATED via unwrap_or. + assert_eq!(ds.advance(), TERMINATED); + assert_eq!(ds.doc(), TERMINATED); + // Subsequent advances keep returning TERMINATED: cursor increments past + // doc_ids.len(), get() returns None, unwrap_or yields TERMINATED. 
+ assert_eq!(ds.advance(), TERMINATED); + } + + #[test] + fn test_sorted_doc_id_set_empty() { + let mut ds = SortedDocIdSet::new(Arc::new(vec![])); + assert_eq!(ds.doc(), TERMINATED); + assert_eq!(ds.advance(), TERMINATED); + assert_eq!(ds.seek(0), TERMINATED); + } + + #[test] + fn test_sorted_doc_id_set_seek_exact_hit() { + let ids = Arc::new(vec![1u32, 3, 7, 10, 15]); + let mut ds = SortedDocIdSet::new(ids); + + assert_eq!(ds.seek(7), 7); + assert_eq!(ds.doc(), 7); + + // Seeking to the same target is idempotent. + assert_eq!(ds.seek(7), 7); + + assert_eq!(ds.seek(10), 10); + assert_eq!(ds.doc(), 10); + } + + #[test] + fn test_sorted_doc_id_set_seek_between_entries() { + let ids = Arc::new(vec![1u32, 3, 7, 10, 15]); + let mut ds = SortedDocIdSet::new(ids); + + // Target falls between 3 and 7 → should return 7. + assert_eq!(ds.seek(4), 7); + assert_eq!(ds.doc(), 7); + + // Target falls between 10 and 15 → should return 15. + assert_eq!(ds.seek(11), 15); + assert_eq!(ds.doc(), 15); + } + + #[test] + fn test_sorted_doc_id_set_seek_past_last_entry() { + let ids = Arc::new(vec![1u32, 3, 7]); + let mut ds = SortedDocIdSet::new(ids); + + assert_eq!(ds.seek(100), TERMINATED); + assert_eq!(ds.doc(), TERMINATED); + } + + #[test] + fn test_sorted_doc_id_set_seek_terminated_sentinel() { + let ids = Arc::new(vec![1u32, 3, 7]); + let mut ds = SortedDocIdSet::new(ids); + + assert_eq!(ds.seek(TERMINATED), TERMINATED); + assert_eq!(ds.doc(), TERMINATED); + } + + #[test] + fn test_sorted_doc_id_set_seek_before_current_position() { + // After advancing past the start, seeking to the current doc must not + // go backwards. + let ids = Arc::new(vec![1u32, 5, 9]); + let mut ds = SortedDocIdSet::new(ids); + + ds.advance(); // cursor → 5 + // Seeking to 5 (= current) must keep returning 5. 
+ assert_eq!(ds.seek(5), 5); + assert_eq!(ds.doc(), 5); + } + + #[test] + fn test_sorted_doc_id_set_size_hint_decrements() { + let ids = Arc::new(vec![1u32, 3, 7, 10]); + let mut ds = SortedDocIdSet::new(ids); + + assert_eq!(ds.size_hint(), 4); + ds.advance(); + assert_eq!(ds.size_hint(), 3); + ds.advance(); + ds.advance(); + ds.advance(); // now TERMINATED + assert_eq!(ds.size_hint(), 0); + } + + #[test] + fn test_soft_delete_query_no_deleted_docs() -> tantivy::Result<()> { + let (_index, reader) = make_index(5)?; + let searcher = reader.searcher(); + + let query = SoftDeleteQuery::new(Box::new(AllQuery), vec![]); + assert_eq!(searcher.search(&query, &Count)?, 5); + Ok(()) + } + + #[test] + fn test_soft_delete_query_excludes_subset() -> tantivy::Result<()> { + let (_index, reader) = make_index(5)?; + let searcher = reader.searcher(); + + // Delete doc IDs 1 and 3; 0, 2, 4 should remain. + let query = SoftDeleteQuery::new(Box::new(AllQuery), vec![1, 3]); + assert_eq!(searcher.search(&query, &Count)?, 3); + Ok(()) + } + + #[test] + fn test_soft_delete_query_excludes_all_docs() -> tantivy::Result<()> { + let (_index, reader) = make_index(3)?; + let searcher = reader.searcher(); + + let query = SoftDeleteQuery::new(Box::new(AllQuery), vec![0, 1, 2]); + assert_eq!(searcher.search(&query, &Count)?, 0); + Ok(()) + } + + #[test] + fn test_soft_delete_query_count_method_matches_search() -> tantivy::Result<()> { + let (_index, reader) = make_index(10)?; + let searcher = reader.searcher(); + + // Delete every even doc ID. 
+ let deleted: Vec = (0..10).filter(|x| x % 2 == 0).collect(); + let query = SoftDeleteQuery::new(Box::new(AllQuery), deleted); + + let count_via_search = searcher.search(&query, &Count)?; + let count_via_method = query.count(&searcher)?; + + assert_eq!(count_via_search, 5); + assert_eq!(count_via_method, 5); + Ok(()) + } +} diff --git a/quickwit/quickwit-search/src/sort_repr.rs b/quickwit/quickwit-search/src/sort_repr.rs new file mode 100644 index 00000000000..940e97366c9 --- /dev/null +++ b/quickwit/quickwit-search/src/sort_repr.rs @@ -0,0 +1,409 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt::Debug; +use std::ops::Not; + +use quickwit_proto::search::SortOrder; +use tantivy::DocId; + +use crate::top_k_computer::MinValue; + +/// A u64 that can be elided to unit type to save memory. 
+pub(crate) trait ElidableU64: Ord + Copy + Debug + MinValue { + fn value(self) -> u64; + fn from_u64(value: u64) -> Self; + fn is_elided() -> bool; +} + +impl MinValue for u64 { + fn min_value() -> Self { + 0 + } +} + +impl MinValue for () { + fn min_value() -> Self {} +} + +impl ElidableU64 for u64 { + fn from_u64(value: u64) -> Self { + value + } + fn value(self) -> u64 { + self + } + fn is_elided() -> bool { + false + } +} + +impl ElidableU64 for () { + fn from_u64(_value: u64) -> Self {} + fn value(self) -> u64 { + 0 + } + fn is_elided() -> bool { + true + } +} + +/// Encoded representation of the value, the index of its accessor in the list +/// of fast field columns and the sort order. +/// +/// The first u8 encodes either the index of the accessor or a sentinel value for +/// missing and search after values (listed here as stored for a descending sort): +/// - 0 is a sentinel for skip all +/// - 1 is a sentinel for missing (always last in the sort order) +/// - other odd values encode the index of the accessor in the list of fast field columns (3 for +/// index 0, 5 for index 1, etc.) +/// - even values are sentinels for search after values that keep/skip all documents for a given +/// column (2 to skip all columns but keep missing, 4 only keeps column 0, 6 keeps column 0 and 1, +/// etc.) +/// +/// The following u64 encodes the value itself or its bitwise negation to reverse the sort +/// order when building an ascending sort (keeping in mind that this is fed to a top-k +/// calculator). In that case the u8 of values and per-column sentinels is negated as well. +#[derive(Clone, Copy)] +pub(crate) struct InternalValueRepr(u8, V); + +/// Inverts the sort order by negating the bits. +/// +/// Using the bitwise negation is a cheap way to reverse the order while +/// maintaining the type (and memory footprint). It is also reversible +/// (`not(not(value)) == value`) which makes it trivially decodable. +/// +/// This wrapper is just an alias to make the code more readable. Using `!value` +/// or `value.not()` inline yields the same result.
+#[inline] +fn reverse>(value: T) -> T { + value.not() +} + +impl InternalValueRepr { + #[inline] + pub fn new(value: u64, accessor_idx: u8, order: SortOrder) -> Self { + // For Asc, smaller values should win: invert index and value so smaller maps to larger repr + match order { + SortOrder::Asc => Self(reverse(accessor_idx * 2 + 3), V::from_u64(reverse(value))), + SortOrder::Desc => Self(accessor_idx * 2 + 3, V::from_u64(value)), + } + } + /// A sentinel value that can be instantiated as search after boundary to indicate + /// that all documents should be kept for the given column. + pub fn new_keep_column(accessor_idx: u8, order: SortOrder) -> Self { + match order { + SortOrder::Asc => Self(reverse(accessor_idx * 2 + 2), V::from_u64(0)), + SortOrder::Desc => Self(accessor_idx * 2 + 4, V::from_u64(0)), + } + } + #[inline] + pub fn new_missing() -> Self { + // Missing always last in topk, so use the smallest possible value + // (besides the skip_all value) + Self(1, V::from_u64(0)) + } + /// A sentinel value that can be instantiated as search after boundary to indicate + /// that all documents should be skipped for the given column. + pub fn new_skip_column(accessor_idx: u8, order: SortOrder) -> Self { + match order { + SortOrder::Asc => Self(reverse(accessor_idx * 2 + 4), V::from_u64(0)), + SortOrder::Desc => Self(accessor_idx * 2 + 2, V::from_u64(0)), + } + } + /// A sentinel value that can be instantiated as search after boundary to indicate + /// that all documents should be skipped, except those with a missing value.
+ pub fn new_skip_all_but_missing() -> Self { + Self(2, V::from_u64(0)) + } + #[inline] + pub fn decode(self, order: SortOrder) -> Option<(u8, u64)> { + if self.0 == 1 { + return None; + } + debug_assert_eq!( + match order { + SortOrder::Asc => reverse(self.0), + SortOrder::Desc => self.0, + } % 2, + 1, + "sentinel indexes are not meant to be decoded" + ); + match order { + SortOrder::Asc => Some(((reverse(self.0) - 3) / 2, reverse(V::value(self.1)))), + SortOrder::Desc => Some(((self.0 - 3) / 2, V::value(self.1))), + } + } +} + +/// Ordered representation of the sort values. It is the concatenation of: +/// - the first sort value's internal representation, a (u8, ElidableU64) pair +/// - the second sort value's internal representation, a (u8, ElidableU64) pair +/// - the doc id, preceded by a sentinel indicating how it should be used for tie-breaking +/// +/// ElidableU64 is used instead of u64 for sort values to reduce the size of the +/// representation when they are not used. The associated sentinels could also +/// be elided, but in practice they don't have an impact on the tuple's size +/// because the doc id and its sentinel (u8, u32) get padded anyway.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Default, Hash)] +pub(crate) struct InternalSortValueRepr(u8, V1, u8, V2, u8, u32); + +impl InternalSortValueRepr { + #[inline] + pub fn new( + sort_1: InternalValueRepr, + sort_2: InternalValueRepr, + doc_id: DocId, + doc_id_sort: SortOrder, + ) -> Self { + // For Asc, smaller values should win: invert so smaller maps to larger repr + match doc_id_sort { + SortOrder::Asc => Self(sort_1.0, sort_1.1, sort_2.0, sort_2.1, 1, reverse(doc_id)), + SortOrder::Desc => Self(sort_1.0, sort_1.1, sort_2.0, sort_2.1, 1, doc_id), + } + } + pub fn new_keep_doc_ids(sort_1: InternalValueRepr, sort_2: InternalValueRepr) -> Self { + Self(sort_1.0, sort_1.1, sort_2.0, sort_2.1, 2, 0) + } + pub fn new_skip_doc_ids(sort_1: InternalValueRepr, sort_2: InternalValueRepr) -> Self { + Self(sort_1.0, sort_1.1, sort_2.0, sort_2.1, 0, 0) + } + #[inline] + pub fn sort_1(self) -> InternalValueRepr { + InternalValueRepr(self.0, self.1) + } + #[inline] + pub fn sort_2(self) -> InternalValueRepr { + InternalValueRepr(self.2, self.3) + } + #[inline] + pub fn doc_id(self, order: SortOrder) -> DocId { + debug_assert_eq!(self.4, 1, "doc id sentinel is not meant to be decoded"); + match order { + SortOrder::Asc => reverse(self.5), + SortOrder::Desc => self.5, + } + } + pub fn is_skip_all(&self) -> bool { + *self <= Self(1, V1::min_value(), 1, V2::min_value(), 1, 0) + } +} + +impl MinValue for InternalSortValueRepr { + fn min_value() -> Self { + Self(0, V1::min_value(), 0, V2::min_value(), 1, 0) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_internal_sort_value_repr_ordering_values() { + // Primary sort (Desc v1=10) dominates over secondary (Desc v2=100) and doc_id. 
+ let lhs = InternalSortValueRepr::new( + InternalValueRepr::::new(10, 0, SortOrder::Desc), + InternalValueRepr::::new(0, 0, SortOrder::Desc), + 0, + SortOrder::Desc, + ); + let rhs = InternalSortValueRepr::new( + InternalValueRepr::::new(5, 0, SortOrder::Desc), + InternalValueRepr::::new(100, 0, SortOrder::Desc), + 999, + SortOrder::Desc, + ); + assert!(lhs > rhs, "primary sort must dominate, desc"); + + // Same values but Asc, the order is reversed + let lhs = InternalSortValueRepr::new( + InternalValueRepr::::new(10, 0, SortOrder::Asc), + InternalValueRepr::::new(0, 0, SortOrder::Desc), + 0, + SortOrder::Desc, + ); + let rhs = InternalSortValueRepr::new( + InternalValueRepr::::new(5, 0, SortOrder::Asc), + InternalValueRepr::::new(100, 0, SortOrder::Desc), + 999, + SortOrder::Desc, + ); + assert!(lhs < rhs, "primary sort must dominate, asc"); + + // Secondary sort (Desc v2) breaks a tie on the primary field. + let lhs = InternalSortValueRepr::new( + InternalValueRepr::::new(10, 0, SortOrder::Desc), + InternalValueRepr::::new(10, 0, SortOrder::Desc), + 0, + SortOrder::Desc, + ); + let rhs = InternalSortValueRepr::new( + InternalValueRepr::::new(10, 0, SortOrder::Desc), + InternalValueRepr::::new(5, 0, SortOrder::Desc), + 0, + SortOrder::Desc, + ); + assert!(lhs > rhs, "secondary sort must break primary tie, desc"); + + // Same values but Asc, the order is reversed. + let lhs = InternalSortValueRepr::new( + InternalValueRepr::::new(10, 0, SortOrder::Desc), + InternalValueRepr::::new(10, 0, SortOrder::Asc), + 0, + SortOrder::Desc, + ); + let rhs = InternalSortValueRepr::new( + InternalValueRepr::::new(10, 0, SortOrder::Desc), + InternalValueRepr::::new(5, 0, SortOrder::Asc), + 0, + SortOrder::Desc, + ); + assert!(lhs < rhs, "secondary sort must break primary tie, asc"); + + // Doc-id Desc tiebreaker: higher doc_id wins. 
+ let lhs = InternalSortValueRepr::new( + InternalValueRepr::::new(10, 0, SortOrder::Desc), + InternalValueRepr::::new_missing(), + 10, + SortOrder::Desc, + ); + let rhs = InternalSortValueRepr::new( + InternalValueRepr::::new(10, 0, SortOrder::Desc), + InternalValueRepr::::new_missing(), + 5, + SortOrder::Desc, + ); + assert!(lhs > rhs, "Desc: higher doc_id must win tiebreaker"); + + // Doc-id Asc tiebreaker: lower doc_id wins. + let lhs = InternalSortValueRepr::new( + InternalValueRepr::::new(10, 0, SortOrder::Desc), + InternalValueRepr::::new_missing(), + 5, + SortOrder::Asc, + ); + let rhs = InternalSortValueRepr::new( + InternalValueRepr::::new(10, 0, SortOrder::Desc), + InternalValueRepr::::new_missing(), + 10, + SortOrder::Asc, + ); + assert!(lhs > rhs, "Asc: lower doc_id must win tiebreaker"); + + // Missing values are always smaller + let lhs = InternalSortValueRepr::new( + InternalValueRepr::::new_missing(), + InternalValueRepr::::new(10, 0, SortOrder::Desc), + 10, + SortOrder::Desc, + ); + let rhs = InternalSortValueRepr::new( + InternalValueRepr::::new(5, 0, SortOrder::Desc), + InternalValueRepr::::new(0, 0, SortOrder::Desc), + 0, + SortOrder::Desc, + ); + assert!(lhs < rhs, "missing values are always smaller, desc"); + + // Same but Asc, missing is still smaller. + let lhs = InternalSortValueRepr::new( + InternalValueRepr::::new_missing(), + InternalValueRepr::::new(10, 0, SortOrder::Desc), + 10, + SortOrder::Desc, + ); + let rhs = InternalSortValueRepr::new( + InternalValueRepr::::new(5, 0, SortOrder::Asc), + InternalValueRepr::::new(0, 0, SortOrder::Desc), + 0, + SortOrder::Desc, + ); + assert!(lhs < rhs, "missing values are always smaller, asc"); + } + + #[test] + fn test_internal_sort_value_repr_ordering_sentinels() { + // Doc-id sentinel ordering: skip_doc_ids < normal_doc_id < keep_doc_ids. 
+ let s1 = InternalValueRepr::::new(10, 0, SortOrder::Desc); + let s2 = InternalValueRepr::::new_missing(); + let skip_docs = InternalSortValueRepr::new_skip_doc_ids(s1, s2); + let keep_docs = InternalSortValueRepr::new_keep_doc_ids(s1, s2); + let normal_doc_desc = InternalSortValueRepr::new(s1, s2, 0, SortOrder::Desc); + let normal_doc_asc = InternalSortValueRepr::new(s1, s2, 0, SortOrder::Asc); + assert!( + skip_docs < normal_doc_desc, + "skip_doc_ids must be below normal" + ); + assert!( + normal_doc_desc < keep_docs, + "normal must be below keep_doc_ids" + ); + assert!( + skip_docs < normal_doc_asc, + "skip_doc_ids must be below normal" + ); + assert!( + normal_doc_asc < keep_docs, + "normal must be below keep_doc_ids" + ); + } + + #[test] + fn test_internal_sort_value_repr_ordering_types() { + // Primary accessor ordering dominates all the rest + let lhs = InternalSortValueRepr::new( + InternalValueRepr::::new(5, 1, SortOrder::Desc), + InternalValueRepr::::new(0, 0, SortOrder::Desc), + 0, + SortOrder::Desc, + ); + let rhs = InternalSortValueRepr::new( + InternalValueRepr::::new(15, 0, SortOrder::Desc), + InternalValueRepr::::new(100, 0, SortOrder::Desc), + 999, + SortOrder::Desc, + ); + assert!(lhs > rhs, "primary type sort must dominate, desc"); + + // Same values but Asc, the order is reversed + let lhs = InternalSortValueRepr::new( + InternalValueRepr::::new(5, 1, SortOrder::Asc), + InternalValueRepr::::new(0, 0, SortOrder::Desc), + 0, + SortOrder::Desc, + ); + let rhs = InternalSortValueRepr::new( + InternalValueRepr::::new(15, 0, SortOrder::Asc), + InternalValueRepr::::new(100, 0, SortOrder::Desc), + 999, + SortOrder::Desc, + ); + assert!(lhs < rhs, "primary type sort must dominate, asc"); + } + + #[test] + fn test_memory_footprint() { + // Make sure that the memory representation is efficiently packed. 
For + // instance refactoring to: + // ``` + // struct InternalSortValueRepr(InternalValueRepr,InternalValueRepr,u64) + // ``` + // would cause InternalSortValueRepr to jump to 40 bytes. + + assert_eq!(std::mem::size_of::>(), 24); + assert_eq!(std::mem::size_of::>(), 16); + assert_eq!(std::mem::size_of::>(), 8); + } +} diff --git a/quickwit/quickwit-search/src/tests.rs b/quickwit/quickwit-search/src/tests.rs index dc6dfe9f9cd..534e2401ff1 100644 --- a/quickwit/quickwit-search/src/tests.rs +++ b/quickwit/quickwit-search/src/tests.rs @@ -14,6 +14,7 @@ use std::cmp::Ordering; use std::collections::{BTreeMap, BTreeSet}; +use std::vec; use assert_json_diff::{assert_json_eq, assert_json_include}; use quickwit_config::SearcherConfig; @@ -22,8 +23,8 @@ use quickwit_doc_mapper::tag_pruning::extract_tags_from_query; use quickwit_indexing::TestSandbox; use quickwit_opentelemetry::otlp::TraceId; use quickwit_proto::search::{ - LeafListTermsResponse, ListTermsRequest, SearchRequest, SortByValue, SortField, SortOrder, - SortValue, + LeafListTermsResponse, ListTermsRequest, PartialHit, SearchRequest, SortByValue, + SortDatetimeFormat, SortField, SortOrder, SortValue, }; use quickwit_query::query_ast::{ QueryAst, qast_helper, qast_json_helper, query_ast_from_user_text, @@ -179,7 +180,8 @@ async fn test_single_search_with_snippet() -> anyhow::Result<()> { Ok(()) } -async fn slop_search_and_check( +/// Search with "body" as default field and assert expected number of matches. 
+async fn search_and_check( test_sandbox: &TestSandbox, index_id: &str, query: &str, @@ -234,33 +236,98 @@ async fn test_slop_queries() { ]; test_sandbox.add_documents(docs.clone()).await.unwrap(); - slop_search_and_check(&test_sandbox, index_id, "\"small bird\"~2", 0) + search_and_check(&test_sandbox, index_id, "\"small bird\"~2", 0) .await .unwrap(); - slop_search_and_check(&test_sandbox, index_id, "\"red bike\"~2", 1) + search_and_check(&test_sandbox, index_id, "\"red bike\"~2", 1) .await .unwrap(); - slop_search_and_check(&test_sandbox, index_id, "\"small blue bike\"~3", 1) + search_and_check(&test_sandbox, index_id, "\"small blue bike\"~3", 1) .await .unwrap(); - slop_search_and_check(&test_sandbox, index_id, "\"small bike\"", 1) + search_and_check(&test_sandbox, index_id, "\"small bike\"", 1) .await .unwrap(); - slop_search_and_check(&test_sandbox, index_id, "\"small bike\"~1", 2) + search_and_check(&test_sandbox, index_id, "\"small bike\"~1", 2) .await .unwrap(); - slop_search_and_check(&test_sandbox, index_id, "\"small bike\"~2", 2) + search_and_check(&test_sandbox, index_id, "\"small bike\"~2", 2) .await .unwrap(); - slop_search_and_check(&test_sandbox, index_id, "\"small bike\"~3", 3) + search_and_check(&test_sandbox, index_id, "\"small bike\"~3", 3) .await .unwrap(); - slop_search_and_check(&test_sandbox, index_id, "\"tiny shelter\"~3", 1) + search_and_check(&test_sandbox, index_id, "\"tiny shelter\"~3", 1) .await .unwrap(); test_sandbox.assert_quit().await; } +#[tokio::test] +async fn test_multi_term_queries() { + let index_id = "multi-term-query"; + let doc_mapping_yaml = r#" + field_mappings: + - name: title + type: text + - name: body + type: text + record: position + "#; + + let test_sandbox = TestSandbox::create(index_id, doc_mapping_yaml, "{}", &["body"]) + .await + .unwrap(); + let docs = vec![ + json!({"title": "one", "body": "a red bike"}), + json!({"title": "two", "body": "a small blue bike"}), + json!({"title": "three", "body": "a small, 
rusty, and yellow bike"}), + json!({"title": "four", "body": "fred's small bike"}), + json!({"title": "five", "body": "a tiny shelter"}), + ]; + test_sandbox.add_documents(docs.clone()).await.unwrap(); + + search_and_check( + &test_sandbox, + index_id, + "IN [red blue green yellow pink black]", + 3, + ) + .await + .unwrap(); + + search_and_check(&test_sandbox, index_id, "IN [aaaa]", 0) + .await + .unwrap(); + + search_and_check(&test_sandbox, index_id, "IN [red]", 1) + .await + .unwrap(); + + search_and_check(&test_sandbox, index_id, "IN [zzzz]", 0) + .await + .unwrap(); + + search_and_check( + &test_sandbox, + index_id, + "red OR blue OR green OR yellow OR pink OR black", + 3, + ) + .await + .unwrap(); + + search_and_check(&test_sandbox, index_id, "red AND \"small bike\"", 0) + .await + .unwrap(); + + search_and_check(&test_sandbox, index_id, "bike AND \"small bike\"", 1) + .await + .unwrap(); + + test_sandbox.assert_quit().await; +} + #[tokio::test] async fn test_single_node_several_splits() -> anyhow::Result<()> { let index_id = "single-node-several-splits"; @@ -371,7 +438,8 @@ async fn test_single_node_filtering() -> anyhow::Result<()> { test_sandbox.metastore(), test_sandbox.storage_resolver(), ) - .await?; + .await + .unwrap(); assert_eq!(single_node_response.num_hits, 10); assert_eq!(single_node_response.hits.len(), 10); assert!(&single_node_response.hits[0].json.contains("t:19")); @@ -395,7 +463,8 @@ async fn test_single_node_filtering() -> anyhow::Result<()> { test_sandbox.metastore(), test_sandbox.storage_resolver(), ) - .await?; + .await + .unwrap(); assert_eq!(single_node_response.num_hits, 19); assert_eq!(single_node_response.hits.len(), 19); assert!(&single_node_response.hits[0].json.contains("t:19")); @@ -890,7 +959,7 @@ async fn test_sort_by_2_field() { } #[tokio::test] -async fn test_single_node_invalid_sorting_with_query() { +async fn test_sort_by_text() { let index_id = "single-node-invalid-sorting"; let doc_mapping_yaml = r#" field_mappings: @@ 
-906,7 +975,7 @@ async fn test_single_node_invalid_sorting_with_query() { let mut docs = Vec::new(); for i in 0..30 { - let description = format!("city info-{}", i + 1); + let description = format!("city info-{:02}", i + 1); docs.push(json!({"description": description, "ts": i+1, "temperature": i+32})); } test_sandbox.add_documents(docs).await.unwrap(); @@ -927,13 +996,19 @@ async fn test_single_node_invalid_sorting_with_query() { test_sandbox.metastore(), test_sandbox.storage_resolver(), ) - .await; - assert!(single_node_response.is_err()); - let error_msg = single_node_response.unwrap_err().to_string(); - assert_eq!( - error_msg, - "Invalid argument: sort by field on type text is currently not supported `description`" - ); + .await + .unwrap(); + + assert_eq!(single_node_response.num_hits, 30); + assert_eq!(single_node_response.hits.len(), 15); + assert!(single_node_response.hits.windows(2).all(|hits| { + let hit0: JsonValue = serde_json::from_str(&hits[0].json).unwrap(); + let hit1: JsonValue = serde_json::from_str(&hits[1].json).unwrap(); + hit0["description"].as_str().unwrap() >= hit1["description"].as_str().unwrap() + })); + assert!(single_node_response.hits[0].json.contains("city info-30")); + assert!(single_node_response.hits[14].json.contains("city info-16")); + test_sandbox.assert_quit().await; } @@ -1887,3 +1962,630 @@ fn test_global_doc_address_ser_deser() { let doc_address_deser: GlobalDocAddress = doc_address_string.parse().unwrap(); assert_eq!(doc_address_deser, doc_address); } + +#[tokio::test] +async fn test_single_node_soft_delete_excludes_from_search() -> anyhow::Result<()> { + use quickwit_metastore::IndexMetadataResponseExt; + use quickwit_proto::metastore::{ + IndexMetadataRequest, MetastoreService, SoftDeleteDocumentsRequest, SplitDocIds, + }; + + let index_id = "test-soft-delete-search"; + let doc_mapping_yaml = r#" + field_mappings: + - name: title + type: text + "#; + let test_sandbox = TestSandbox::create(index_id, doc_mapping_yaml, "{}", 
&["title"]).await?; + let docs = vec![ + json!({"title": "alpha"}), + json!({"title": "beta"}), + json!({"title": "gamma"}), + ]; + test_sandbox.add_documents(docs).await?; + + // Search all — should find 3 + let search_request = SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: qast_json_helper("*", &["title"]), + max_hits: 10, + ..Default::default() + }; + let result = single_node_search( + search_request.clone(), + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + assert_eq!(result.num_hits, 3); + + // Search for "alpha" specifically to find its doc_id and split_id + let alpha_request = SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: qast_json_helper("alpha", &["title"]), + max_hits: 10, + ..Default::default() + }; + let alpha_result = single_node_search( + alpha_request, + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + assert_eq!(alpha_result.num_hits, 1); + let alpha_hit = &alpha_result.hits[0]; + let partial_hit = alpha_hit.partial_hit.as_ref().unwrap(); + let split_id = partial_hit.split_id.clone(); + let doc_id = partial_hit.doc_id; + + // Soft-delete that document via the metastore + let index_uid = test_sandbox + .metastore() + .index_metadata(IndexMetadataRequest::for_index_id(index_id.to_string())) + .await? + .deserialize_index_metadata()? 
+ .index_uid; + + let metastore = test_sandbox.metastore(); + metastore + .soft_delete_documents(SoftDeleteDocumentsRequest { + index_uid: Some(index_uid), + split_doc_ids: vec![SplitDocIds { + split_id: split_id.clone(), + doc_ids: vec![doc_id], + }], + }) + .await?; + + // Search all again — should find only 2 + let result = single_node_search( + search_request, + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + assert_eq!(result.num_hits, 2); + + // Verify that the soft-deleted document ("alpha") is not in the results + for hit in &result.hits { + let hit_json: JsonValue = serde_json::from_str(&hit.json)?; + assert_ne!(hit_json["title"], "alpha"); + } + + test_sandbox.assert_quit().await; + Ok(()) +} + +#[tokio::test] +async fn test_single_node_soft_delete_count_only() -> anyhow::Result<()> { + use quickwit_metastore::IndexMetadataResponseExt; + use quickwit_proto::metastore::{ + IndexMetadataRequest, MetastoreService, SoftDeleteDocumentsRequest, SplitDocIds, + }; + + let index_id = "test-soft-delete-count-only"; + let doc_mapping_yaml = r#" + field_mappings: + - name: title + type: text + "#; + let test_sandbox = TestSandbox::create(index_id, doc_mapping_yaml, "{}", &["title"]).await?; + let docs = vec![ + json!({"title": "alpha"}), + json!({"title": "beta"}), + json!({"title": "gamma"}), + ]; + test_sandbox.add_documents(docs).await?; + + // Count-only search (max_hits: 0) — should find 3 + let count_request = SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: qast_json_helper("*", &["title"]), + max_hits: 0, + ..Default::default() + }; + let result = single_node_search( + count_request.clone(), + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + assert_eq!(result.num_hits, 3); + assert!(result.hits.is_empty()); + + // Find the doc_id for "alpha" so we can soft-delete it + let alpha_request = SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: 
qast_json_helper("alpha", &["title"]), + max_hits: 10, + ..Default::default() + }; + let alpha_result = single_node_search( + alpha_request, + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + assert_eq!(alpha_result.num_hits, 1); + let partial_hit = alpha_result.hits[0].partial_hit.as_ref().unwrap(); + let split_id = partial_hit.split_id.clone(); + let doc_id = partial_hit.doc_id; + + // Soft-delete that document via the metastore + let index_uid = test_sandbox + .metastore() + .index_metadata(IndexMetadataRequest::for_index_id(index_id.to_string())) + .await? + .deserialize_index_metadata()? + .index_uid; + + let metastore = test_sandbox.metastore(); + metastore + .soft_delete_documents(SoftDeleteDocumentsRequest { + index_uid: Some(index_uid), + split_doc_ids: vec![SplitDocIds { + split_id, + doc_ids: vec![doc_id], + }], + }) + .await?; + + // Count-only search again — should find only 2 + let result = single_node_search( + count_request, + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + assert_eq!(result.num_hits, 2); + assert!(result.hits.is_empty()); + + test_sandbox.assert_quit().await; + Ok(()) +} + +/// Regression test: the `is_count_only` path (non-MatchAll query with max_hits=0) was calling +/// `query.count(&searcher)` which bypasses Quickwit's soft-delete filter entirely. +/// MatchAll + max_hits=0 goes through `is_metadata_count_request_with_ast` (already correct); +/// this test specifically exercises the `is_count_only` branch with a real term query. 
+#[tokio::test] +async fn test_single_node_soft_delete_count_only_term_query() -> anyhow::Result<()> { + use quickwit_metastore::IndexMetadataResponseExt; + use quickwit_proto::metastore::{ + IndexMetadataRequest, MetastoreService, SoftDeleteDocumentsRequest, SplitDocIds, + }; + + let index_id = "test-soft-delete-count-only-term-query"; + let doc_mapping_yaml = r#" + field_mappings: + - name: title + type: text + "#; + let test_sandbox = TestSandbox::create(index_id, doc_mapping_yaml, "{}", &["title"]).await?; + let docs = vec![ + json!({"title": "alpha"}), + json!({"title": "beta"}), + json!({"title": "gamma"}), + ]; + test_sandbox.add_documents(docs).await?; + + // Use a non-MatchAll query so that the `is_count_only` branch is taken instead of + // `is_metadata_count_request_with_ast`. "alpha OR beta OR gamma" matches all 3 docs + // but is not `QueryAst::MatchAll`. + let count_request = SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: qast_json_helper("alpha OR beta OR gamma", &["title"]), + max_hits: 0, + ..Default::default() + }; + let result = single_node_search( + count_request.clone(), + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + assert_eq!(result.num_hits, 3); + assert!(result.hits.is_empty()); + + // Locate the doc_id for "alpha" so we can soft-delete it. + let alpha_result = single_node_search( + SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: qast_json_helper("alpha", &["title"]), + max_hits: 10, + ..Default::default() + }, + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + assert_eq!(alpha_result.num_hits, 1); + let partial_hit = alpha_result.hits[0].partial_hit.as_ref().unwrap(); + let split_id = partial_hit.split_id.clone(); + let doc_id = partial_hit.doc_id; + + // Soft-delete the "alpha" document. + let index_uid = test_sandbox + .metastore() + .index_metadata(IndexMetadataRequest::for_index_id(index_id.to_string())) + .await? 
+ .deserialize_index_metadata()? + .index_uid; + test_sandbox + .metastore() + .soft_delete_documents(SoftDeleteDocumentsRequest { + index_uid: Some(index_uid), + split_doc_ids: vec![SplitDocIds { + split_id, + doc_ids: vec![doc_id], + }], + }) + .await?; + + // Count-only term query: before the fix this returned 3 (soft-deleted doc was counted); + // after the fix it must return 2. + let result = single_node_search( + count_request, + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + assert_eq!(result.num_hits, 2); + assert!(result.hits.is_empty()); + + test_sandbox.assert_quit().await; + Ok(()) +} + +/// Tests that when sorting by a datetime field with `sort_datetime_format` set to millis: +/// 1. The sort values returned in `partial_hit` are in milliseconds (not nanoseconds). +/// 2. Those values can be fed back as `search_after` to retrieve the next page correctly. +#[tokio::test] +async fn test_sort_by_datetime_format_millis_and_search_after() -> anyhow::Result<()> { + let index_id = "sort-datetime-millis-search-after"; + let doc_mapping_yaml = r#" + field_mappings: + - name: ts + type: datetime + fast: true + - name: body + type: text + timestamp_field: ts + "#; + let test_sandbox = TestSandbox::create(index_id, doc_mapping_yaml, "{}", &["body"]).await?; + + // Index 10 documents with timestamps 100_000_000_000 .. 100_000_009_000 ms since epoch. 
+ let base_secs: i64 = 100_000_000; + let docs: Vec<_> = (0..10) + .map(|i| json!({"ts": base_secs + i, "body": format!("doc {i}")})) + .collect(); + test_sandbox.add_documents(docs).await?; + + let sort_field = SortField { + field_name: "ts".to_string(), + sort_order: SortOrder::Desc as i32, + sort_datetime_format: Some(SortDatetimeFormat::UnixTimestampMillis as i32), + }; + + // Page 1: top 5 hits sorted by ts desc with millis output + let page1 = single_node_search( + SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: qast_json_helper("*", &["body"]), + max_hits: 5, + sort_fields: vec![sort_field.clone()], + ..Default::default() + }, + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + + assert_eq!(page1.num_hits, 10); + assert_eq!(page1.hits.len(), 5); + + // Verify sort values are in milliseconds (not nanoseconds) + let expected_millis: Vec = (5..10).rev().map(|i| (base_secs + i) * 1_000).collect(); + let actual_millis: Vec = page1 + .hits + .iter() + .map(|hit| { + let partial_hit = hit.partial_hit.as_ref().unwrap(); + match &partial_hit.sort_value.as_ref().unwrap().sort_value { + Some(SortValue::I64(ms)) => *ms, + other => panic!("expected I64 sort value in millis, got {other:?}"), + } + }) + .collect(); + assert_eq!(actual_millis, expected_millis); + + // Page 2: use the last hit's sort value as search_after + let last_hit = page1.hits.last().unwrap().partial_hit.as_ref().unwrap(); + let search_after = PartialHit { + sort_value: last_hit.sort_value.clone(), + sort_value2: None, + split_id: String::new(), + segment_ord: 0, + doc_id: 0, + }; + + let page2 = single_node_search( + SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: qast_json_helper("*", &["body"]), + max_hits: 5, + sort_fields: vec![sort_field], + search_after: Some(search_after), + ..Default::default() + }, + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + + 
assert_eq!(page2.hits.len(), 5); + // Page 2 should contain docs with timestamps base_secs+4 down to base_secs+0 in millis + let expected_millis_page2: Vec = (0..5).rev().map(|i| (base_secs + i) * 1_000).collect(); + let actual_millis_page2: Vec = page2 + .hits + .iter() + .map(|hit| { + let partial_hit = hit.partial_hit.as_ref().unwrap(); + match &partial_hit.sort_value.as_ref().unwrap().sort_value { + Some(SortValue::I64(ms)) => *ms, + other => panic!("expected I64 sort value in millis, got {other:?}"), + } + }) + .collect(); + assert_eq!(actual_millis_page2, expected_millis_page2); + + test_sandbox.assert_quit().await; + Ok(()) +} + +#[tokio::test] +async fn test_sort_by_dynamic_with_datetime_page_fails() -> anyhow::Result<()> { + let index_id = "sort-dynamic-datetime-page-fails"; + let doc_mapping_yaml = r#" + field_mappings: + - name: ts + type: datetime + fast: true + mode: dynamic + dynamic_mapping: + fast: true + timestamp_field: ts + "#; + let test_sandbox = TestSandbox::create(index_id, doc_mapping_yaml, "{}", &["body"]).await?; + + let docs = [ + json!({"ts": 100_000_001, "my_dynamic_field": 2024}), + json!({"ts": 100_000_002, "my_dynamic_field": "2024-03-30T00:00:00Z"}), + json!({"ts": 100_000_001, "my_dynamic_field": 2025}), + json!({"ts": 100_000_002, "my_dynamic_field": "2025-03-30T00:00:00Z"}), + json!({"ts": 100_000_001, "my_dynamic_field": 2026}), + json!({"ts": 100_000_002, "my_dynamic_field": "2026-03-30T00:00:00Z"}), + ]; + test_sandbox.add_documents(docs).await?; + + let sort_field = SortField { + field_name: "my_dynamic_field".to_string(), + sort_order: SortOrder::Desc as i32, + ..Default::default() + }; + + // Page 1: sort should work even on a dynamic field with a datetime column + // values for the first page + let page1 = single_node_search( + SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: qast_json_helper("*", &["body"]), + max_hits: 5, + sort_fields: vec![sort_field.clone()], + ..Default::default() + }, + 
test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + + assert_eq!(page1.num_hits, 6); + assert_eq!(page1.hits.len(), 5); + + // Verify mixed-type ordering: datetimes (in nanoseconds) come first, then integers + let page_1_sort_values: Vec<_> = page1 + .hits + .iter() + .map(|hit| { + &hit.partial_hit + .as_ref() + .unwrap() + .sort_value + .as_ref() + .unwrap() + .sort_value + }) + .collect(); + assert_eq!( + page_1_sort_values, + vec![ + &Some(SortValue::Datetime(1774828800000000000)), + &Some(SortValue::Datetime(1743292800000000000)), + &Some(SortValue::Datetime(1711756800000000000)), + &Some(SortValue::I64(2026)), + &Some(SortValue::I64(2025)), + ] + ); + + // Page 2: search after not yet supported + let last_hit = page1.hits.last().unwrap().partial_hit.as_ref().unwrap(); + let search_after = PartialHit { + sort_value: last_hit.sort_value.clone(), + sort_value2: None, + split_id: String::new(), + segment_ord: 0, + doc_id: 0, + }; + + let page2 = single_node_search( + SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: qast_json_helper("*", &["body"]), + max_hits: 5, + sort_fields: vec![sort_field], + search_after: Some(search_after), + ..Default::default() + }, + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await + .unwrap(); + + assert_eq!(page2.failed_splits.len(), 1); + assert_eq!(page2.hits.len(), 0); + + test_sandbox.assert_quit().await; + Ok(()) +} + +#[tokio::test] +async fn test_sort_by_two_fields_with_null() -> anyhow::Result<()> { + let index_id = "sort-two-fields-with-null"; + let doc_mapping_yaml = r#" + field_mappings: + - name: ts + type: datetime + fast: true + - name: body + type: text + fast: true + timestamp_field: ts + "#; + let test_sandbox = TestSandbox::create(index_id, doc_mapping_yaml, "{}", &["body"]).await?; + + // timestamps with 10 digits should be interpreted as secs + let docs: Vec<_> = vec![ + json!({"ts": 1_000_000_001i64, "body": "doc 9"}), + json!({"ts": 
1_000_000_002i64, "body": "doc 8"}), + json!({"ts": 1_000_000_003i64, "body": "doc 7"}), + json!({"ts": 1_000_000_004i64}), + json!({"ts": 1_000_000_005i64}), + json!({"ts": 1_000_000_006i64}), + ]; + test_sandbox.add_documents(docs).await?; + + let sort_fields = vec![ + SortField { + field_name: "body".to_string(), + sort_order: SortOrder::Asc as i32, + ..Default::default() + }, + SortField { + field_name: "ts".to_string(), + sort_order: SortOrder::Asc as i32, + sort_datetime_format: Some(SortDatetimeFormat::UnixTimestampMillis as i32), + }, + ]; + + let page1 = single_node_search( + SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: qast_json_helper("*", &["body"]), + max_hits: 5, + sort_fields: sort_fields.clone(), + ..Default::default() + }, + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + + assert_eq!(page1.num_hits, 6); + assert_eq!(page1.hits.len(), 5); + let page_1_hits = page1 + .hits + .iter() + .map(|hit| hit.partial_hit.clone().unwrap()) + .collect::>(); + let split_id = page_1_hits[0].split_id.clone(); + // for the timestamp field we convert to sort_datetime_format repr as I64 + assert_eq!( + page_1_hits, + vec![ + PartialHit { + sort_value: Some(SortValue::Str("doc 7".to_string()).into()), + sort_value2: Some(SortValue::I64(1_000_000_003_000).into()), + split_id: split_id.clone(), + segment_ord: 0, + doc_id: 2, + }, + PartialHit { + sort_value: Some(SortValue::Str("doc 8".to_string()).into()), + sort_value2: Some(SortValue::I64(1_000_000_002_000).into()), + split_id: split_id.clone(), + segment_ord: 0, + doc_id: 1, + }, + PartialHit { + sort_value: Some(SortValue::Str("doc 9".to_string()).into()), + sort_value2: Some(SortValue::I64(1_000_000_001_000).into()), + split_id: split_id.clone(), + segment_ord: 0, + doc_id: 0, + }, + PartialHit { + sort_value: Some(SortByValue { sort_value: None }), + sort_value2: Some(SortValue::I64(1_000_000_004_000).into()), + split_id: 
split_id.clone(), + segment_ord: 0, + doc_id: 3, + }, + PartialHit { + sort_value: Some(SortByValue { sort_value: None }), + sort_value2: Some(SortValue::I64(1_000_000_005_000).into()), + split_id: split_id.clone(), + segment_ord: 0, + doc_id: 4, + }, + ] + ); + + let page2 = single_node_search( + SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: qast_json_helper("*", &["body"]), + max_hits: 5, + sort_fields: sort_fields.clone(), + search_after: Some(page_1_hits[4].clone()), + ..Default::default() + }, + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await + .unwrap(); + + assert_eq!(page2.num_hits, 6); + assert_eq!(page2.hits.len(), 1); + let page_2_hits = page2 + .hits + .iter() + .map(|hit| hit.partial_hit.clone().unwrap()) + .collect::>(); + let split_id = page_2_hits[0].split_id.clone(); + // for the timestamp field we convert to sort_datetime_format repr as I64 + assert_eq!( + page_2_hits, + vec![PartialHit { + sort_value: Some(SortByValue { sort_value: None }), + sort_value2: Some(SortValue::I64(1_000_000_006_000).into()), + split_id: split_id.clone(), + segment_ord: 0, + doc_id: 5, + },] + ); + + test_sandbox.assert_quit().await; + Ok(()) +} diff --git a/quickwit/quickwit-search/src/top_k_collector.rs b/quickwit/quickwit-search/src/top_k_collector.rs index f36eb6370e2..3dc9f2bd6f2 100644 --- a/quickwit/quickwit-search/src/top_k_collector.rs +++ b/quickwit/quickwit-search/src/top_k_collector.rs @@ -12,862 +12,179 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::cmp::{Ordering, Reverse}; -use std::fmt::Debug; -use std::marker::PhantomData; +use std::cmp::Ordering; -use quickwit_common::binary_heap::TopK; -use quickwit_proto::search::{PartialHit, SortOrder}; +use quickwit_proto::search::PartialHit; use quickwit_proto::types::SplitId; -use tantivy::{DocId, Score}; +use tantivy::{DocId, Score, SegmentOrdinal}; -use crate::collector::{ - HitSortingMapper, SegmentPartialHit, SegmentPartialHitSortingKey, - SortingFieldExtractorComponent, SortingFieldExtractorPair, -}; +use crate::collector::SortingFieldExtractorPair; +use crate::sort_repr::{ElidableU64, InternalSortValueRepr}; +use crate::top_k_computer::TopKComputer; -pub trait QuickwitSegmentTopKCollector { - fn collect_top_k_block(&mut self, docs: &[DocId]); - fn collect_top_k(&mut self, doc_id: DocId, score: Score); - fn get_top_k(&self) -> Vec; -} - -trait IntoOptionU64 { - #[inline] - fn is_unit_type() -> bool { - false - } - fn into_option_u64(self) -> Option; - fn from_option_u64(value: Option) -> Self; -} -trait MinValue { - fn min_value() -> Self; -} - -impl IntoOptionU64 for Option { - #[inline] - fn into_option_u64(self) -> Option { - self - } - #[inline] - fn from_option_u64(value: Option) -> Self { - value - } -} - -impl MinValue for Option { - #[inline] - fn min_value() -> Self { - None - } -} - -impl IntoOptionU64 for Option> { - #[inline] - fn into_option_u64(self) -> Option { - self.map(|el| el.0) - } - #[inline] - fn from_option_u64(value: Option) -> Self { - value.map(Reverse) - } -} -impl MinValue for Option> { - #[inline] - fn min_value() -> Self { - None - } -} - -impl IntoOptionU64 for () { - #[inline] - fn is_unit_type() -> bool { - true - } - #[inline] - fn into_option_u64(self) -> Option { - None - } - #[inline] - fn from_option_u64(_: Option) -> Self {} -} -impl MinValue for () { - #[inline] - fn min_value() -> Self {} -} - -/// Generic hit struct for top k collector. -/// V1 and V2 are the types of the two values to sort by. 
-/// They are either Option or _statically_ disabled via unit type. -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -struct Hit { - doc_id: DocId, - value1: V1, - value2: V2, -} - -impl MinValue for Hit -where - V1: MinValue, - V2: MinValue, -{ - #[inline] - fn min_value() -> Self { - let doc_id = if REVERSE_DOCID { - DocId::MAX - } else { - DocId::MIN - }; - Hit { - doc_id, - value1: V1::min_value(), - value2: V2::min_value(), - } - } -} - -impl std::fmt::Display for Hit -where - V1: Copy + PartialEq + Eq + PartialOrd + Ord + Debug, - V2: Copy + PartialEq + Eq + PartialOrd + Ord + Debug, -{ - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "Hit(doc_id: {}, value1: {:?}, value2: {:?})", - self.doc_id, self.value1, self.value2 - ) - } -} - -impl Ord for Hit -where - V1: Copy + PartialEq + Eq + PartialOrd + Ord + Debug + MinValue, - V2: Copy + PartialEq + Eq + PartialOrd + Ord + Debug + MinValue, -{ - #[inline] - fn cmp(&self, other: &Self) -> Ordering { - let order = self.value1.cmp(&other.value1); - order - .then_with(|| self.value2.cmp(&other.value2)) - .then_with(|| { - if REVERSE_DOCID { - other.doc_id.cmp(&self.doc_id) - } else { - self.doc_id.cmp(&other.doc_id) - } - }) - } -} - -impl PartialOrd for Hit -where - V1: Copy + PartialEq + Eq + PartialOrd + Ord + Debug + MinValue, - V2: Copy + PartialEq + Eq + PartialOrd + Ord + Debug + MinValue, -{ - #[inline] - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl< - V1: Copy + PartialEq + Eq + PartialOrd + Ord + IntoOptionU64 + Debug + MinValue, - V2: Copy + PartialEq + Eq + PartialOrd + Ord + IntoOptionU64 + Debug + MinValue, - const REVERSE_DOCID: bool, -> Hit -{ - #[inline] - fn into_segment_partial_hit(self) -> SegmentPartialHit { - SegmentPartialHit { - sort_value: self.value1.into_option_u64(), - sort_value2: self.value2.into_option_u64(), - doc_id: self.doc_id, - } - } -} - -pub fn specialized_top_k_segment_collector( +pub struct 
QuickwitSegmentTopKCollectorTemplate { split_id: SplitId, - score_extractor: SortingFieldExtractorPair, - leaf_max_hits: usize, - segment_ord: u32, - search_after_option: Option, - order1: SortOrder, - order2: SortOrder, -) -> Box { - // TODO: Add support for search_after to the specialized collector. - // Eventually we may want to remove the generic collector to reduce complexity. - if search_after_option.is_some() || score_extractor.is_score() { - return Box::new(GenericQuickwitSegmentTopKCollector::new( - split_id, - score_extractor, - leaf_max_hits, - segment_ord, - search_after_option, - order1, - order2, - )); - } - - let sort_first_by_ff = score_extractor.first.is_fast_field(); - let sort_second_by_ff = score_extractor - .second - .as_ref() - .map(|extr| extr.is_fast_field()) - .unwrap_or(false); - - #[derive(Debug)] - enum SortType { - DocId, - OneFFSort, - TwoFFSorts, - } - let sort_type = match (sort_first_by_ff, sort_second_by_ff) { - (false, false) => SortType::DocId, - (true, false) => SortType::OneFFSort, - (true, true) => SortType::TwoFFSorts, - (false, true) => panic!("Internal error: Got second sort, but no first sort"), - }; - // only check order1 for OneFFSort and DocId, as it's the only sort - // - // REVERSE_DOCID is only used for SortType::DocId and SortType::OneFFSort - match (sort_type, order1, order2) { - (SortType::DocId, SortOrder::Desc, _) => { - Box::new(SpecializedSegmentTopKCollector::<(), (), false>::new( - split_id, - score_extractor, - leaf_max_hits, - segment_ord, - )) - } - (SortType::DocId, SortOrder::Asc, _) => { - Box::new(SpecializedSegmentTopKCollector::<(), (), true>::new( - split_id, - score_extractor, - leaf_max_hits, - segment_ord, - )) - } - (SortType::OneFFSort, SortOrder::Asc, SortOrder::Asc) => { - Box::new(SpecializedSegmentTopKCollector::< - Option>, - (), - true, - >::new( - split_id, score_extractor, leaf_max_hits, segment_ord - )) - } - (SortType::OneFFSort, SortOrder::Desc, SortOrder::Asc) => Box::new( - 
SpecializedSegmentTopKCollector::, (), false>::new( - split_id, - score_extractor, - leaf_max_hits, - segment_ord, - ), - ), - (SortType::OneFFSort, SortOrder::Asc, SortOrder::Desc) => { - Box::new(SpecializedSegmentTopKCollector::< - Option>, - (), - true, - >::new( - split_id, score_extractor, leaf_max_hits, segment_ord - )) - } - (SortType::OneFFSort, SortOrder::Desc, SortOrder::Desc) => Box::new( - SpecializedSegmentTopKCollector::, (), false>::new( - split_id, - score_extractor, - leaf_max_hits, - segment_ord, - ), - ), - (SortType::TwoFFSorts, SortOrder::Asc, SortOrder::Asc) => { - Box::new(SpecializedSegmentTopKCollector::< - Option>, - Option>, - true, - >::new( - split_id, score_extractor, leaf_max_hits, segment_ord - )) - } - (SortType::TwoFFSorts, SortOrder::Asc, SortOrder::Desc) => { - Box::new(SpecializedSegmentTopKCollector::< - Option>, - Option, - true, - >::new( - split_id, score_extractor, leaf_max_hits, segment_ord - )) - } - (SortType::TwoFFSorts, SortOrder::Desc, SortOrder::Asc) => { - Box::new(SpecializedSegmentTopKCollector::< - Option, - Option>, - false, - >::new( - split_id, score_extractor, leaf_max_hits, segment_ord - )) - } - (SortType::TwoFFSorts, SortOrder::Desc, SortOrder::Desc) => { - Box::new(SpecializedSegmentTopKCollector::< - Option, - Option, - false, - >::new( - split_id, score_extractor, leaf_max_hits, segment_ord - )) - } - } -} - -/// Fast Top K Computation -/// -/// The buffer is truncated to the top_n elements when it reaches the capacity of the Vec. -/// That means capacity has special meaning and should be carried over when cloning or serializing. -/// -/// For TopK == 0, it will be relative expensive. 
-struct TopKComputer { - /// Reverses sort order to get top-semantics instead of bottom-semantics - buffer: Vec>, - top_n: usize, - pub(crate) threshold: D, -} - -// Custom clone to keep capacity -impl Clone for TopKComputer { - fn clone(&self) -> Self { - let mut buffer_clone = Vec::with_capacity(self.buffer.capacity()); - buffer_clone.extend(self.buffer.iter().cloned()); - - TopKComputer { - buffer: buffer_clone, - top_n: self.top_n, - threshold: self.threshold.clone(), - } - } -} - -impl TopKComputer -where D: Ord + Copy + Debug + MinValue -{ - /// Create a new `TopKComputer`. - pub fn new(top_n: usize) -> Self { - // Vec cap can't be 0, since it would panic in push - let vec_cap = top_n.max(1) * 10; - TopKComputer { - buffer: Vec::with_capacity(vec_cap), - top_n, - threshold: D::min_value(), - } + // We track the segment ordinal here, but splits only have 1 segment so this + // should always be 0. + segment_ord: SegmentOrdinal, + hit_fetcher: SortingFieldExtractorPair, + top_k_hits: TopKComputer>, + search_after_opt: Option>, +} + +impl QuickwitSegmentTopKCollectorTemplate { + pub(crate) fn collect_top_k_block(&mut self, docs: &[DocId]) { + let search_after_opt = self.search_after_opt; + let top_k_hits = &mut self.top_k_hits; + self.hit_fetcher + .project_to_internal_sort_value_block(docs, |repr| { + if let Some(search_after) = search_after_opt + && repr.cmp(&search_after) != Ordering::Less + { + return; + } + top_k_hits.push(repr); + }); } - /// Push a new document to the top n. - /// If the document is below the current threshold, it will be ignored. 
- #[inline] - pub fn push(&mut self, doc: D) { - if doc < self.threshold { + pub(crate) fn collect_top_k(&mut self, doc_id: DocId, score: Score) { + let internal_repr = self + .hit_fetcher + .project_to_internal_sort_value(doc_id, score); + if let Some(search_after) = self.search_after_opt + && internal_repr.cmp(&search_after) != Ordering::Less + { return; } - if self.buffer.len() == self.buffer.capacity() { - let median = self.truncate_top_n(); - self.threshold = median; - } - - // This is faster since it avoids the buffer resizing to be inlined from vec.push() - // (this is in the hot path) - // TODO: Replace with `push_within_capacity` when it's stabilized - let uninit = self.buffer.spare_capacity_mut(); - // This cannot panic, because we truncate_median will at least remove one element, since - // the min capacity is larger than 2. - uninit[0].write(Reverse(doc)); - // This is safe because it would panic in the line above - unsafe { - self.buffer.set_len(self.buffer.len() + 1); - } + self.top_k_hits.push(internal_repr); } - #[inline(never)] - fn truncate_top_n(&mut self) -> D { - // Use select_nth_unstable to find the top nth score - let (_, median_el, _) = self.buffer.select_nth_unstable(self.top_n); - - let median_score = *median_el; - // Remove all elements below the top_n - self.buffer.truncate(self.top_n); - - median_score.0 - } - - /// Returns the top n elements in sorted order. - pub fn into_sorted_vec(mut self) -> Vec { - if self.buffer.len() > self.top_n { - self.truncate_top_n(); - } - self.buffer.sort_unstable(); - self.buffer.into_iter().map(|el| el.0).collect() - } - - /// Returns the top n elements in stored order. - /// Useful if you do not need the elements in sorted order, - /// for example when merging the results of multiple segments. 
- #[allow(dead_code)] - pub fn into_vec(mut self) -> Vec { - if self.buffer.len() > self.top_n { - self.truncate_top_n(); - } - self.buffer.into_iter().map(|el| el.0).collect() - } -} - -pub use tantivy::COLLECT_BLOCK_BUFFER_LEN; -struct SpecSortingFieldExtractor { - _phantom: std::marker::PhantomData<(V1, V2)>, - sort_values1: Box<[Option; COLLECT_BLOCK_BUFFER_LEN]>, - sort_values2: Box<[Option; COLLECT_BLOCK_BUFFER_LEN]>, - - pub first: SortingFieldExtractorComponent, - pub second: Option, -} - -impl< - V1: Copy + PartialEq + PartialOrd + Ord + IntoOptionU64 + Debug, - V2: Copy + PartialEq + PartialOrd + Ord + IntoOptionU64 + Debug, -> SpecSortingFieldExtractor -{ - fn new( - first: SortingFieldExtractorComponent, - second: Option, - ) -> Self { - Self { - _phantom: PhantomData, - sort_values1: vec![None; COLLECT_BLOCK_BUFFER_LEN] - .into_boxed_slice() - .try_into() - .unwrap(), - sort_values2: vec![None; COLLECT_BLOCK_BUFFER_LEN] - .into_boxed_slice() - .try_into() - .unwrap(), - first, - second, - } - } - /// Fetches the sort values for the given docs. - /// Does noting when sorting by docid. 
- fn fetch_data(&mut self, docs: &[DocId]) { - self.first - .extract_typed_sort_values_block(docs, &mut self.sort_values1[..docs.len()]); - if let Some(second) = self.second.as_ref() { - second.extract_typed_sort_values_block(docs, &mut self.sort_values2[..docs.len()]); - } - } - #[inline] - fn iter_hits<'a, const REVERSE_DOCID: bool>( - &'a self, - docs: &'a [DocId], - ) -> impl Iterator> + 'a { - SpecSortingFieldIter::::new( - docs, - &self.sort_values1, - &self.sort_values2, - ) - } -} - -struct SpecSortingFieldIter<'a, V1, V2, const REVERSE_DOCID: bool> { - docs: std::slice::Iter<'a, DocId>, - sort_values1: std::slice::Iter<'a, Option>, - sort_values2: std::slice::Iter<'a, Option>, - _phantom: PhantomData<(V1, V2)>, -} - -impl<'a, V1, V2, const REVERSE_DOCID: bool> SpecSortingFieldIter<'a, V1, V2, REVERSE_DOCID> -where - V1: Copy + PartialEq + PartialOrd + Ord + IntoOptionU64, - V2: Copy + PartialEq + PartialOrd + Ord + IntoOptionU64, -{ - #[inline] - pub fn new( - docs: &'a [DocId], - sort_values1: &'a [Option; COLLECT_BLOCK_BUFFER_LEN], - sort_values2: &'a [Option; COLLECT_BLOCK_BUFFER_LEN], - ) -> Self { - Self { - docs: docs.iter(), - sort_values1: sort_values1.iter(), - sort_values2: sort_values2.iter(), - _phantom: PhantomData, - } - } -} - -impl Iterator for SpecSortingFieldIter<'_, V1, V2, REVERSE_DOCID> -where - V1: Copy + PartialEq + Eq + PartialOrd + Ord + IntoOptionU64 + Debug, - V2: Copy + PartialEq + Eq + PartialOrd + Ord + IntoOptionU64 + Debug, -{ - type Item = Hit; - - #[inline] - fn next(&mut self) -> Option { - let doc_id = *self.docs.next()?; - - let value1 = if !V1::is_unit_type() { - V1::from_option_u64(*self.sort_values1.next()?) - } else { - V1::from_option_u64(None) - }; - - let value2 = if !V2::is_unit_type() { - V2::from_option_u64(*self.sort_values2.next()?) 
- } else { - V2::from_option_u64(None) - }; - - Some(Hit { - doc_id, - value1, - value2, - }) - } -} - -/// No search after handling -/// Quickwit collector working at the scale of the segment. -struct SpecializedSegmentTopKCollector< - V1: Copy + PartialEq + Eq + PartialOrd + Ord + IntoOptionU64 + Debug + MinValue, - V2: Copy + PartialEq + Eq + PartialOrd + Ord + IntoOptionU64 + Debug + MinValue, - const REVERSE_DOCID: bool, -> { - split_id: SplitId, - hit_fetcher: SpecSortingFieldExtractor, - top_k_hits: TopKComputer>, - segment_ord: u32, -} - -impl< - V1: Copy + PartialEq + Eq + PartialOrd + Ord + IntoOptionU64 + Debug + MinValue + 'static, - V2: Copy + PartialEq + Eq + PartialOrd + Ord + IntoOptionU64 + Debug + MinValue + 'static, - const REVERSE_DOCID: bool, -> SpecializedSegmentTopKCollector -{ - pub fn new( - split_id: SplitId, - score_extractor: SortingFieldExtractorPair, - leaf_max_hits: usize, - segment_ord: u32, - ) -> Self { - let hit_fetcher = - SpecSortingFieldExtractor::new(score_extractor.first, score_extractor.second); - let top_k_hits = TopKComputer::new(leaf_max_hits); - Self { - split_id, - hit_fetcher, - top_k_hits, - segment_ord, - } - } -} -impl< - V1: Copy + PartialEq + Eq + PartialOrd + Ord + IntoOptionU64 + Debug + MinValue, - V2: Copy + PartialEq + Eq + PartialOrd + Ord + IntoOptionU64 + Debug + MinValue, - const REVERSE_DOCID: bool, -> QuickwitSegmentTopKCollector for SpecializedSegmentTopKCollector -{ - fn collect_top_k_block(&mut self, docs: &[DocId]) { - self.hit_fetcher.fetch_data(docs); - let iter = self.hit_fetcher.iter_hits::(docs); - for doc_id in iter { - self.top_k_hits.push(doc_id); - } - } - - #[inline] - fn collect_top_k(&mut self, _doc_id: DocId, _score: Score) { - panic!("Internal Error: This collector does not support collect_top_k"); - } - - fn get_top_k(&self) -> Vec { + pub(crate) fn get_top_k(&self) -> tantivy::Result> { self.top_k_hits .clone() .into_sorted_vec() .into_iter() - .map(|el| 
el.into_segment_partial_hit()) - .map(|segment_partial_hit: SegmentPartialHit| { - segment_partial_hit.into_partial_hit( - self.split_id.clone(), + .map(|internal_repr| { + self.hit_fetcher.internal_to_partial_hit( + &self.split_id, self.segment_ord, - &self.hit_fetcher.first, - &self.hit_fetcher.second, + internal_repr, ) }) .collect() } } -/// Quickwit collector working at the scale of the segment. -pub(crate) struct GenericQuickwitSegmentTopKCollector { - split_id: SplitId, - score_extractor: SortingFieldExtractorPair, - // PartialHits in this heap don't contain a split_id yet. - top_k_hits: TopK, - segment_ord: u32, - search_after: Option, - // Precomputed order for search_after for split_id and segment_ord - precomp_search_after_order: Ordering, - sort_values1: Box<[Option; COLLECT_BLOCK_BUFFER_LEN]>, - sort_values2: Box<[Option; COLLECT_BLOCK_BUFFER_LEN]>, +pub enum QuickwitSegmentTopKCollector { + DocIdSort(QuickwitSegmentTopKCollectorTemplate<(), ()>), + OneDimSort(QuickwitSegmentTopKCollectorTemplate), + TwoDimSort(QuickwitSegmentTopKCollectorTemplate), + Noop, } -impl GenericQuickwitSegmentTopKCollector { - pub fn new( +impl QuickwitSegmentTopKCollector { + pub fn new_with_doc_id_sort( split_id: SplitId, - score_extractor: SortingFieldExtractorPair, - leaf_max_hits: usize, - segment_ord: u32, - search_after_option: Option, - order1: SortOrder, - order2: SortOrder, + segment_ord: SegmentOrdinal, + hit_fetcher: SortingFieldExtractorPair<(), ()>, + top_k: usize, + search_after_opt: Option>, ) -> Self { - let sort_key_mapper = HitSortingMapper { order1, order2 }; - let precomp_search_after_order = match &search_after_option { - Some(search_after) if !search_after.split_id.is_empty() => order1 - .compare(&split_id, &search_after.split_id) - .then_with(|| order1.compare(&segment_ord, &search_after.segment_ord)), - // This value isn't actually used. 
- _ => Ordering::Equal, - }; - let search_after = - SearchAfterSegment::new(search_after_option, order1, order2, &score_extractor); - - GenericQuickwitSegmentTopKCollector { - split_id, - score_extractor, - top_k_hits: TopK::new(leaf_max_hits, sort_key_mapper), // Adjusted for context - segment_ord, - search_after, - precomp_search_after_order, - sort_values1: vec![None; COLLECT_BLOCK_BUFFER_LEN] - .into_boxed_slice() - .try_into() - .unwrap(), - sort_values2: vec![None; COLLECT_BLOCK_BUFFER_LEN] - .into_boxed_slice() - .try_into() - .unwrap(), + if let Some(search_after) = &search_after_opt + && search_after.is_skip_all() + { + QuickwitSegmentTopKCollector::Noop + } else { + QuickwitSegmentTopKCollector::DocIdSort(QuickwitSegmentTopKCollectorTemplate { + split_id, + segment_ord, + top_k_hits: TopKComputer::new(top_k), + hit_fetcher, + search_after_opt, + }) } } - #[inline] - /// Generic top k collection, that includes search_after handling - /// - /// Outside of the collector to circumvent lifetime issues. - fn collect_top_k_vals( - doc_id: DocId, - sort_value: Option, - sort_value2: Option, - search_after: &Option, - precomp_search_after_order: Ordering, - top_k_hits: &mut TopK, - ) { - if let Some(search_after) = &search_after { - let search_after_value1 = search_after.sort_value; - let search_after_value2 = search_after.sort_value2; - let orders = &top_k_hits.sort_key_mapper; - let mut cmp_result = orders - .order1 - .compare_opt(&sort_value, &search_after_value1) - .then_with(|| { - orders - .order2 - .compare_opt(&sort_value2, &search_after_value2) - }); - if search_after.compare_on_equal { - // TODO actually it's not first, it should be what's in _shard_doc then first then - // default - let order = orders.order1; - cmp_result = cmp_result - .then(precomp_search_after_order) - // We compare doc_id only if sort_value1, sort_value2, split_id and segment_ord - // are equal. 
- .then_with(|| order.compare(&doc_id, &search_after.doc_id)) - } - if cmp_result != Ordering::Less { - return; - } + pub fn new_with_one_dim_sort( + split_id: SplitId, + segment_ord: SegmentOrdinal, + hit_fetcher: SortingFieldExtractorPair, + top_k: usize, + search_after_opt: Option>, + ) -> Self { + if let Some(search_after) = &search_after_opt + && search_after.is_skip_all() + { + QuickwitSegmentTopKCollector::Noop + } else { + QuickwitSegmentTopKCollector::OneDimSort(QuickwitSegmentTopKCollectorTemplate { + split_id, + segment_ord, + top_k_hits: TopKComputer::new(top_k), + hit_fetcher, + search_after_opt, + }) } - - let hit = SegmentPartialHit { - sort_value, - sort_value2, - doc_id, - }; - top_k_hits.add_entry(hit); } -} -impl QuickwitSegmentTopKCollector for GenericQuickwitSegmentTopKCollector { - fn collect_top_k_block(&mut self, docs: &[DocId]) { - self.score_extractor.extract_typed_sort_values( - docs, - &mut self.sort_values1[..], - &mut self.sort_values2[..], - ); - if self.search_after.is_some() { - // Search after not optimized for block collection yet - for ((doc_id, sort_value), sort_value2) in docs - .iter() - .cloned() - .zip(self.sort_values1.iter().cloned()) - .zip(self.sort_values2.iter().cloned()) - { - Self::collect_top_k_vals( - doc_id, - sort_value, - sort_value2, - &self.search_after, - self.precomp_search_after_order, - &mut self.top_k_hits, - ); - } + + pub fn new_with_two_dim_sort( + split_id: SplitId, + segment_ord: SegmentOrdinal, + hit_fetcher: SortingFieldExtractorPair, + top_k: usize, + search_after_opt: Option>, + ) -> Self { + if let Some(search_after) = &search_after_opt + && search_after.is_skip_all() + { + QuickwitSegmentTopKCollector::Noop } else { - // Probably would make sense to check the fence against e.g. sort_values1 earlier, - // before creating the SegmentPartialHit. - // - // Below are different versions to avoid iterating the caches if they are unused. - // - // No sort values loaded. Sort only by doc_id. 
- if !self.score_extractor.first.is_fast_field() { - for doc_id in docs.iter().cloned() { - let hit = SegmentPartialHit { - sort_value: None, - sort_value2: None, - doc_id, - }; - self.top_k_hits.add_entry(hit); - } - return; + QuickwitSegmentTopKCollector::TwoDimSort(QuickwitSegmentTopKCollectorTemplate { + split_id, + segment_ord, + top_k_hits: TopKComputer::new(top_k), + hit_fetcher, + search_after_opt, + }) + } + } + + pub(crate) fn collect_top_k_block(&mut self, docs: &[DocId]) { + match self { + QuickwitSegmentTopKCollector::DocIdSort(collector) => { + collector.collect_top_k_block(docs) } - let has_no_second_sort = !self - .score_extractor - .second - .as_ref() - .map(|extr| extr.is_fast_field()) - .unwrap_or(false); - // No second sort values => We can skip iterating the second sort values cache. - if has_no_second_sort { - for (doc_id, sort_value) in - docs.iter().cloned().zip(self.sort_values1.iter().cloned()) - { - let hit = SegmentPartialHit { - sort_value, - sort_value2: None, - doc_id, - }; - self.top_k_hits.add_entry(hit); - } - return; + QuickwitSegmentTopKCollector::OneDimSort(collector) => { + collector.collect_top_k_block(docs) } - - for ((doc_id, sort_value), sort_value2) in docs - .iter() - .cloned() - .zip(self.sort_values1.iter().cloned()) - .zip(self.sort_values2.iter().cloned()) - { - let hit = SegmentPartialHit { - sort_value, - sort_value2, - doc_id, - }; - self.top_k_hits.add_entry(hit); + QuickwitSegmentTopKCollector::TwoDimSort(collector) => { + collector.collect_top_k_block(docs) } + QuickwitSegmentTopKCollector::Noop => {} } } - #[inline] - fn collect_top_k(&mut self, doc_id: DocId, score: Score) { - let (sort_value, sort_value2): (Option, Option) = - self.score_extractor.extract_typed_sort_value(doc_id, score); - Self::collect_top_k_vals( - doc_id, - sort_value, - sort_value2, - &self.search_after, - self.precomp_search_after_order, - &mut self.top_k_hits, - ); - } - - fn get_top_k(&self) -> Vec { - self.top_k_hits - .clone() - 
.finalize() - .into_iter() - .map(|segment_partial_hit: SegmentPartialHit| { - segment_partial_hit.into_partial_hit( - self.split_id.clone(), - self.segment_ord, - &self.score_extractor.first, - &self.score_extractor.second, - ) - }) - .collect() - } -} - -/// Search After, but the sort values are converted to the u64 fast field representation. -pub(crate) struct SearchAfterSegment { - sort_value: Option, - sort_value2: Option, - compare_on_equal: bool, - doc_id: DocId, -} -impl SearchAfterSegment { - pub fn new( - search_after_opt: Option, - sort_order1: SortOrder, - sort_order2: SortOrder, - score_extractor: &SortingFieldExtractorPair, - ) -> Option { - let search_after = search_after_opt?; - let mut sort_value = None; - if let Some(search_after_sort_value) = search_after - .sort_value - .and_then(|sort_value| sort_value.sort_value) - { - if let Some(new_value) = score_extractor - .first - .convert_to_u64_ff_val(search_after_sort_value, sort_order1) - { - sort_value = Some(new_value); - } else { - // Value is out of bounds, we ignore sort_value2 and disable the whole - // search_after - return None; + pub(crate) fn collect_top_k(&mut self, doc_id: DocId, score: Score) { + match self { + QuickwitSegmentTopKCollector::DocIdSort(collector) => { + collector.collect_top_k(doc_id, score) } - } - let mut sort_value2 = None; - if let Some(search_after_sort_value) = search_after - .sort_value2 - .and_then(|sort_value2| sort_value2.sort_value) - { - let extractor = score_extractor - .second - .as_ref() - .expect("Internal error: Got sort_value2, but no sort extractor"); - if let Some(new_value) = - extractor.convert_to_u64_ff_val(search_after_sort_value, sort_order2) - { - sort_value2 = Some(new_value); + QuickwitSegmentTopKCollector::OneDimSort(collector) => { + collector.collect_top_k(doc_id, score) + } + QuickwitSegmentTopKCollector::TwoDimSort(collector) => { + collector.collect_top_k(doc_id, score) } + QuickwitSegmentTopKCollector::Noop => {} + } + } + + pub(crate) fn 
get_top_k(&self) -> tantivy::Result> { + match self { + QuickwitSegmentTopKCollector::DocIdSort(collector) => collector.get_top_k(), + QuickwitSegmentTopKCollector::OneDimSort(collector) => collector.get_top_k(), + QuickwitSegmentTopKCollector::TwoDimSort(collector) => collector.get_top_k(), + QuickwitSegmentTopKCollector::Noop => Ok(vec![]), } - Some(Self { - sort_value, - sort_value2, - compare_on_equal: !search_after.split_id.is_empty(), - doc_id: search_after.doc_id, - }) } } diff --git a/quickwit/quickwit-search/src/top_k_computer.rs b/quickwit/quickwit-search/src/top_k_computer.rs new file mode 100644 index 00000000000..8f6ff7c8d07 --- /dev/null +++ b/quickwit/quickwit-search/src/top_k_computer.rs @@ -0,0 +1,111 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::cmp::Reverse; +use std::fmt::Debug; + +pub(crate) trait MinValue { + fn min_value() -> Self; +} + +/// Fast Top K Computation +/// +/// The buffer is truncated to the top_n elements when it reaches the capacity of the Vec. +/// That means capacity has special meaning and should be carried over when cloning or serializing. +/// +/// For TopK == 0, it will be relative expensive. 
+pub(crate) struct TopKComputer { + /// Reverses sort order to get top-semantics instead of bottom-semantics + buffer: Vec>, + top_n: usize, + pub(crate) threshold: D, +} + +// Custom clone to keep capacity +impl Clone for TopKComputer { + fn clone(&self) -> Self { + let mut buffer_clone = Vec::with_capacity(self.buffer.capacity()); + buffer_clone.extend(self.buffer.iter().cloned()); + + TopKComputer { + buffer: buffer_clone, + top_n: self.top_n, + threshold: self.threshold.clone(), + } + } +} + +impl TopKComputer +where D: Ord + Copy + Debug + MinValue +{ + /// Create a new `TopKComputer`. + pub fn new(top_n: usize) -> Self { + let vec_cap = top_n.max(1) * 10; + TopKComputer { + buffer: Vec::with_capacity(vec_cap), + top_n, + threshold: D::min_value(), + } + } +} + +impl TopKComputer +where D: Ord + Copy + Debug +{ + /// Push a new document to the top n. + /// If the document is below the current threshold, it will be ignored. + #[inline] + pub fn push(&mut self, doc: D) { + if doc < self.threshold { + return; + } + if self.buffer.len() == self.buffer.capacity() { + let median = self.truncate_top_n(); + self.threshold = median; + } + + // This is faster since it avoids the buffer resizing to be inlined from vec.push() + // (this is in the hot path) + // TODO: Replace with `push_within_capacity` when it's stabilized + let uninit = self.buffer.spare_capacity_mut(); + // This cannot panic, because truncate_top_n will at least remove one element, since + // the min capacity is larger than 2. 
+ uninit[0].write(Reverse(doc)); + // This is safe because it would panic in the line above + unsafe { + self.buffer.set_len(self.buffer.len() + 1); + } + } + + #[inline(never)] + fn truncate_top_n(&mut self) -> D { + // Use select_nth_unstable to find the top nth score + let (_, median_el, _) = self.buffer.select_nth_unstable(self.top_n); + + let median_score = *median_el; + // Remove all elements below the top_n + self.buffer.truncate(self.top_n); + + median_score.0 + } + + /// Returns the top n elements in sorted order. + pub fn into_sorted_vec(mut self) -> Vec { + if self.buffer.len() > self.top_n { + self.truncate_top_n(); + } + self.buffer.sort_unstable(); + self.buffer.into_iter().map(|el| el.0).collect() + } +} diff --git a/quickwit/quickwit-serve/Cargo.toml b/quickwit/quickwit-serve/Cargo.toml index 363065a3403..a30df6519bf 100644 --- a/quickwit/quickwit-serve/Cargo.toml +++ b/quickwit/quickwit-serve/Cargo.toml @@ -31,6 +31,7 @@ itertools = { workspace = true } mime_guess = { workspace = true } once_cell = { workspace = true } percent-encoding = { workspace = true } +pin-project = { workspace = true } pprof = { workspace = true, optional = true } prost = { workspace = true } prost-types = { workspace = true } diff --git a/quickwit/quickwit-serve/src/cluster_api/rest_handler.rs b/quickwit/quickwit-serve/src/cluster_api/rest_handler.rs index f38ddac1627..071122f799d 100644 --- a/quickwit/quickwit-serve/src/cluster_api/rest_handler.rs +++ b/quickwit/quickwit-serve/src/cluster_api/rest_handler.rs @@ -15,6 +15,11 @@ use std::convert::Infallible; use quickwit_cluster::{Cluster, ClusterSnapshot, NodeIdSchema}; +use quickwit_proto::control_plane::{ + ControlPlaneError, ControlPlaneService, ControlPlaneServiceClient, + DisableMaintenanceModeRequest, EnableMaintenanceModeRequest, EnableMaintenanceModeResponse, + GetMaintenanceModeRequest, GetMaintenanceModeResponse, +}; use warp::{Filter, Rejection}; use crate::format::extract_format_from_qs; @@ -23,22 +28,39 @@ use 
crate::rest_api_response::into_rest_api_response; #[derive(utoipa::OpenApi)] #[openapi( - paths(get_cluster), - components(schemas(ClusterSnapshot, NodeIdSchema,)) + paths( + get_cluster, + get_maintenance_endpoint, + enable_maintenance_endpoint, + disable_maintenance_endpoint + ), + components(schemas( + ClusterSnapshot, + NodeIdSchema, + GetMaintenanceModeResponse, + EnableMaintenanceModeResponse + )) )] pub struct ClusterApi; /// Cluster handler. pub fn cluster_handler( cluster: Cluster, + control_plane_client: ControlPlaneServiceClient, ) -> impl Filter + Clone { - warp::path!("cluster") + let cluster_info_handler = warp::path!("cluster") .and(warp::path::end()) .and(warp::get()) .and(warp::path::end().map(move || cluster.clone())) .then(get_cluster) .and(extract_format_from_qs()) .map(into_rest_api_response) + .boxed(); + + let maintenance_routes = maintenance_handler(control_plane_client); + + cluster_info_handler + .or(maintenance_routes) .recover(recover_fn) .boxed() } @@ -57,3 +79,100 @@ async fn get_cluster(cluster: Cluster) -> Result { let snapshot = cluster.snapshot().await; Ok(snapshot) } + +#[utoipa::path( + get, + tag = "Cluster Info", + path = "/cluster/maintenance", + responses( + (status = 200, description = "Successfully fetched maintenance mode status.", body = GetMaintenanceModeResponse) + ) +)] +async fn get_maintenance_endpoint( + control_plane_client: ControlPlaneServiceClient, +) -> Result { + control_plane_client + .get_maintenance_mode(GetMaintenanceModeRequest {}) + .await +} + +#[utoipa::path( + put, + tag = "Cluster Info", + path = "/cluster/maintenance", + responses( + (status = 200, description = "Successfully enabled maintenance mode.", body = EnableMaintenanceModeResponse) + ) +)] +async fn enable_maintenance_endpoint( + control_plane_client: ControlPlaneServiceClient, +) -> Result { + control_plane_client + .enable_maintenance_mode(EnableMaintenanceModeRequest {}) + .await +} + +#[utoipa::path( + delete, + tag = "Cluster Info", + 
path = "/cluster/maintenance", + responses( + (status = 200, description = "Successfully disabled maintenance mode.") + ) +)] +async fn disable_maintenance_endpoint( + control_plane_client: ControlPlaneServiceClient, +) -> Result<(), ControlPlaneError> { + control_plane_client + .disable_maintenance_mode(DisableMaintenanceModeRequest {}) + .await?; + Ok(()) +} + +fn maintenance_get_filter() -> impl Filter + Clone { + warp::path!("cluster" / "maintenance").and(warp::get()) +} + +fn maintenance_put_filter() -> impl Filter + Clone { + warp::path!("cluster" / "maintenance").and(warp::put()) +} + +fn maintenance_delete_filter() -> impl Filter + Clone { + warp::path!("cluster" / "maintenance").and(warp::delete()) +} + +/// Maintenance mode endpoints handler. +/// +/// - `GET /api/v1/cluster/maintenance` — get maintenance status +/// - `PUT /api/v1/cluster/maintenance` — enable maintenance mode +/// - `DELETE /api/v1/cluster/maintenance` — disable maintenance mode +fn maintenance_handler( + control_plane_client: ControlPlaneServiceClient, +) -> impl Filter + Clone { + let get_client = control_plane_client.clone(); + let put_client = control_plane_client.clone(); + let delete_client = control_plane_client; + + let get_handler = maintenance_get_filter() + .and(warp::any().map(move || get_client.clone())) + .then(get_maintenance_endpoint) + .and(extract_format_from_qs()) + .map(into_rest_api_response) + .boxed(); + + let put_handler = maintenance_put_filter() + .and(warp::any().map(move || put_client.clone())) + .then(enable_maintenance_endpoint) + .and(extract_format_from_qs()) + .map(into_rest_api_response) + .boxed(); + + let delete_handler = maintenance_delete_filter() + .and(warp::any().map(move || delete_client.clone())) + .then(disable_maintenance_endpoint) + .and(extract_format_from_qs()) + .map(into_rest_api_response) + .boxed(); + + get_handler.or(put_handler).or(delete_handler).boxed() +} diff --git a/quickwit/quickwit-serve/src/elasticsearch_api/filter.rs 
b/quickwit/quickwit-serve/src/elasticsearch_api/filter.rs index b8d2343f666..071f080fe81 100644 --- a/quickwit/quickwit-serve/src/elasticsearch_api/filter.rs +++ b/quickwit/quickwit-serve/src/elasticsearch_api/filter.rs @@ -14,6 +14,7 @@ use bytes::Bytes; use bytesize::ByteSize; +use http::HeaderValue; use serde::de::DeserializeOwned; use warp::reject::LengthRequired; use warp::{Filter, Rejection}; @@ -165,14 +166,21 @@ pub(crate) fn elastic_resolve_index_filter() } #[utoipa::path(get, tag = "Count", path = "/{index}/_count")] -pub(crate) fn elastic_index_count_filter() --> impl Filter, SearchQueryParamsCount, SearchBody), Error = Rejection> + Clone -{ +pub(crate) fn elastic_index_count_filter() -> impl Filter< + Extract = ( + Vec, + SearchQueryParamsCount, + SearchBody, + Option, + ), + Error = Rejection, +> + Clone { warp::path!("_elastic" / String / "_count") .and_then(extract_index_id_patterns) .and(warp::get().or(warp::post()).unify()) .and(warp::query()) .and(json_or_empty()) + .and(warp::header::optional::("user-agent")) } #[utoipa::path(delete, tag = "Indexes", path = "/{index}")] @@ -222,23 +230,33 @@ pub(crate) fn elastic_cat_indices_filter() } #[utoipa::path(get, tag = "Search", path = "/{index}/_search")] -pub(crate) fn elastic_index_search_filter() --> impl Filter, SearchQueryParams, SearchBody), Error = Rejection> + Clone { +pub(crate) fn elastic_index_search_filter() -> impl Filter< + Extract = ( + Vec, + SearchQueryParams, + SearchBody, + Option, + ), + Error = Rejection, +> + Clone { warp::path!("_elastic" / String / "_search") .and_then(extract_index_id_patterns) .and(warp::get().or(warp::post()).unify()) .and(warp::query()) .and(json_or_empty()) + .and(warp::header::optional::("user-agent")) } #[utoipa::path(post, tag = "Search", path = "/_msearch")] pub(crate) fn elastic_multi_search_filter() --> impl Filter + Clone { +-> impl Filter), Error = Rejection> + Clone +{ warp::path!("_elastic" / "_msearch") 
.and(warp::body::content_length_limit(BODY_LENGTH_LIMIT.as_u64())) .and(warp::body::bytes()) .and(warp::post()) .and(warp::query()) + .and(warp::header::optional::("user-agent")) } fn merge_scroll_body_params( diff --git a/quickwit/quickwit-serve/src/elasticsearch_api/rest_handler.rs b/quickwit/quickwit-serve/src/elasticsearch_api/rest_handler.rs index a9649200e5b..466f575fe51 100644 --- a/quickwit/quickwit-serve/src/elasticsearch_api/rest_handler.rs +++ b/quickwit/quickwit-serve/src/elasticsearch_api/rest_handler.rs @@ -21,6 +21,7 @@ use bytes::Bytes; use elasticsearch_dsl::search::Hit as ElasticHit; use elasticsearch_dsl::{HitsMetadata, ShardStatistics, Source, TotalHits, TotalHitsRelation}; use futures_util::StreamExt; +use http::HeaderValue; use itertools::Itertools; use quickwit_cluster::Cluster; use quickwit_common::truncate_str; @@ -307,6 +308,7 @@ fn build_request_for_es_api( index_id_patterns: Vec, search_params: SearchQueryParams, search_body: SearchBody, + user_agent: Option, ) -> Result<(quickwit_proto::search::SearchRequest, bool), ElasticsearchError> { let default_operator = search_params.default_operator.unwrap_or(BooleanOperand::Or); // The query string, if present, takes priority over what can be in the request @@ -413,6 +415,7 @@ fn build_request_for_es_api( count_hits, ignore_missing_indexes, split_id: None, + user_agent: user_agent.and_then(|h| h.to_str().ok().map(str::to_owned)), }, has_doc_id_field, )) @@ -490,12 +493,13 @@ async fn es_compat_index_count( index_id_patterns: Vec, search_params: SearchQueryParamsCount, search_body: SearchBody, + user_agent: Option, search_service: Arc, ) -> Result { let mut search_params: SearchQueryParams = search_params.into(); search_params.track_total_hits = Some(TrackTotalHits::Track(true)); let (search_request, _append_shard_doc) = - build_request_for_es_api(index_id_patterns, search_params, search_body)?; + build_request_for_es_api(index_id_patterns, search_params, search_body, user_agent)?; let 
search_response: SearchResponse = search_service.root_search(search_request).await?; let search_response_rest: ElasticsearchCountResponse = ElasticsearchCountResponse { count: search_response.num_hits, @@ -507,6 +511,7 @@ async fn es_compat_index_search( index_id_patterns: Vec, search_params: SearchQueryParams, search_body: SearchBody, + user_agent: Option, search_service: Arc, ) -> Result { if search_params.scroll.is_some() && !search_params.allow_partial_search_results() { @@ -520,7 +525,7 @@ async fn es_compat_index_search( let start_instant = Instant::now(); let allow_partial_search_results = search_params.allow_partial_search_results(); let (search_request, append_shard_doc) = - build_request_for_es_api(index_id_patterns, search_params, search_body)?; + build_request_for_es_api(index_id_patterns, search_params, search_body, user_agent)?; let search_response: SearchResponse = search_service.root_search(search_request).await?; let elapsed = start_instant.elapsed(); let mut search_response_rest: ElasticsearchResponse = convert_to_es_search_response( @@ -778,16 +783,16 @@ fn convert_hit( .unwrap_or_else(|_| Source::from_string("{}".to_string()).unwrap()); let mut sort = Vec::new(); - if let Some(partial_hit) = hit.partial_hit { - if let Some(sort_value) = partial_hit.sort_value { - sort.push(sort_value.into_json()); + if let Some(partial_hit) = &hit.partial_hit { + if let Some(sort_value) = &partial_hit.sort_value { + sort.push(sort_value.clone().into_json()); } - if let Some(sort_value2) = partial_hit.sort_value2 { - sort.push(sort_value2.into_json()); + if let Some(sort_value2) = &partial_hit.sort_value2 { + sort.push(sort_value2.clone().into_json()); } if append_shard_doc { sort.push(serde_json::Value::String( - quickwit_search::GlobalDocAddress::from_partial_hit(&partial_hit).to_string(), + quickwit_search::GlobalDocAddress::from_partial_hit(partial_hit).to_string(), )); } } @@ -810,6 +815,7 @@ fn convert_hit( async fn es_compat_index_multi_search( payload: 
Bytes, multi_search_params: MultiSearchQueryParams, + user_agent: Option, search_service: Arc, ) -> Result { let mut search_requests = Vec::new(); @@ -864,8 +870,12 @@ async fn es_compat_index_multi_search( if let Some(extra_filters) = &multi_search_params.extra_filters { search_query_params.extra_filters = Some(extra_filters.to_vec()); } - let es_request = - build_request_for_es_api(index_ids_patterns, search_query_params, search_body)?; + let es_request = build_request_for_es_api( + index_ids_patterns, + search_query_params, + search_body, + user_agent.clone(), + )?; search_requests.push(es_request); } diff --git a/quickwit/quickwit-serve/src/grpc.rs b/quickwit/quickwit-serve/src/grpc.rs index 27d370c38aa..351341af895 100644 --- a/quickwit/quickwit-serve/src/grpc.rs +++ b/quickwit/quickwit-serve/src/grpc.rs @@ -188,10 +188,11 @@ pub(crate) async fn start_grpc_server( let search_service = services.search_service.clone(); let grpc_search_service = GrpcSearchAdapter::from(search_service); + let max_message_size_bytes = grpc_config.max_search_message_size.0 as usize; Some( SearchServiceServer::new(grpc_search_service) - .max_decoding_message_size(grpc_config.max_message_size.0 as usize) - .max_encoding_message_size(grpc_config.max_message_size.0 as usize), + .max_decoding_message_size(max_message_size_bytes) + .max_encoding_message_size(max_message_size_bytes), ) } else { None diff --git a/quickwit/quickwit-serve/src/indexing_api/mod.rs b/quickwit/quickwit-serve/src/indexing_api/mod.rs index 9d3740615a3..e9e16d79431 100644 --- a/quickwit/quickwit-serve/src/indexing_api/mod.rs +++ b/quickwit/quickwit-serve/src/indexing_api/mod.rs @@ -14,4 +14,4 @@ mod rest_handler; -pub use rest_handler::{IndexingApi, indexing_get_handler}; +pub use rest_handler::{IndexingApi, indexing_get_handler, swap_pipelines_handler}; diff --git a/quickwit/quickwit-serve/src/indexing_api/rest_handler.rs b/quickwit/quickwit-serve/src/indexing_api/rest_handler.rs index 1dcc3cd05df..4412c83c43f 
100644 --- a/quickwit/quickwit-serve/src/indexing_api/rest_handler.rs +++ b/quickwit/quickwit-serve/src/indexing_api/rest_handler.rs @@ -16,6 +16,10 @@ use std::convert::Infallible; use quickwit_actors::{AskError, Mailbox, Observe}; use quickwit_indexing::actors::{IndexingService, IndexingServiceCounters}; +use quickwit_proto::control_plane::{ + ControlPlaneError, ControlPlaneService, ControlPlaneServiceClient, SwapIndexingPipelinesEntry, + SwapIndexingPipelinesRequest, SwapIndexingPipelinesResponse, SwapIndexingPipelinesResult, +}; use warp::{Filter, Rejection}; use crate::format::extract_format_from_qs; @@ -24,7 +28,15 @@ use crate::rest::recover_fn; use crate::rest_api_response::into_rest_api_response; #[derive(utoipa::OpenApi)] -#[openapi(paths(indexing_endpoint))] +#[openapi( + paths(indexing_endpoint, swap_pipelines_endpoint), + components(schemas( + SwapIndexingPipelinesRequest, + SwapIndexingPipelinesResponse, + SwapIndexingPipelinesEntry, + SwapIndexingPipelinesResult, + )) +)] pub struct IndexingApi; #[utoipa::path( @@ -59,3 +71,244 @@ pub fn indexing_get_handler( .recover(recover_fn) .boxed() } + +#[utoipa::path( + post, + tag = "Swap pipelines", + path = "/indexing/swap-pipelines", + request_body = SwapIndexingPipelinesRequest, + responses( + (status = 200, description = "Successfully swapped indexing pipelines.", body = SwapIndexingPipelinesResponse) + ) +)] +async fn swap_pipelines_endpoint( + body: SwapIndexingPipelinesRequest, + control_plane_client: ControlPlaneServiceClient, +) -> Result { + control_plane_client.swap_indexing_pipelines(body).await +} + +fn swap_pipelines_post_filter() -> impl Filter + Clone { + warp::path!("indexing" / "swap-pipelines").and(warp::post()) +} + +pub fn swap_pipelines_handler( + control_plane_client: ControlPlaneServiceClient, +) -> impl Filter + Clone { + swap_pipelines_post_filter() + .and(warp::body::json()) + .and(warp::any().map(move || control_plane_client.clone())) + .then(swap_pipelines_endpoint) + 
.and(extract_format_from_qs()) + .map(into_rest_api_response) + .recover(recover_fn) + .boxed() +} + +#[cfg(test)] +mod tests { + use quickwit_proto::control_plane::{ + ControlPlaneServiceClient, MockControlPlaneService, SwapIndexingPipelinesEntry, + SwapIndexingPipelinesRequest, SwapIndexingPipelinesResponse, SwapIndexingPipelinesResult, + }; + use warp::Filter; + + use super::swap_pipelines_handler; + use crate::rest::recover_fn; + + #[tokio::test] + async fn test_swap_pipelines_handler_success() { + let mut mock = MockControlPlaneService::new(); + mock.expect_swap_indexing_pipelines().returning(|request| { + let results = request + .swaps + .iter() + .map(|swap| SwapIndexingPipelinesResult { + swap: Some(swap.clone()), + success: true, + reason: String::new(), + }) + .collect(); + Ok(SwapIndexingPipelinesResponse { results }) + }); + let control_plane_client = ControlPlaneServiceClient::from_mock(mock); + + let handler = swap_pipelines_handler(control_plane_client).recover(recover_fn); + + let body = serde_json::to_vec(&SwapIndexingPipelinesRequest { + swaps: vec![SwapIndexingPipelinesEntry { + left_node_id: "indexer-1".to_string(), + left_index_id: "index-a".to_string(), + right_node_id: "indexer-2".to_string(), + right_index_id: Some("index-b".to_string()), + }], + }) + .unwrap(); + + let resp = warp::test::request() + .method("POST") + .path("/indexing/swap-pipelines") + .header("content-type", "application/json") + .body(body) + .reply(&handler) + .await; + + assert_eq!(resp.status(), 200); + + let response: SwapIndexingPipelinesResponse = serde_json::from_slice(resp.body()).unwrap(); + assert_eq!(response.results.len(), 1); + assert!(response.results[0].success); + let swap = response.results[0].swap.as_ref().unwrap(); + assert_eq!(swap.left_node_id, "indexer-1"); + assert_eq!(swap.left_index_id, "index-a"); + assert_eq!(swap.right_node_id, "indexer-2"); + assert_eq!(swap.right_index_id.as_deref(), Some("index-b")); + } + + #[tokio::test] + async fn 
test_swap_pipelines_handler_partial_failure() { + let mut mock = MockControlPlaneService::new(); + mock.expect_swap_indexing_pipelines().returning(|request| { + let results = request + .swaps + .iter() + .enumerate() + .map(|(i, swap)| { + if i == 0 { + SwapIndexingPipelinesResult { + swap: Some(swap.clone()), + success: true, + reason: String::new(), + } + } else { + SwapIndexingPipelinesResult { + swap: Some(swap.clone()), + success: false, + reason: "pipeline count mismatch".to_string(), + } + } + }) + .collect(); + Ok(SwapIndexingPipelinesResponse { results }) + }); + let control_plane_client = ControlPlaneServiceClient::from_mock(mock); + + let handler = swap_pipelines_handler(control_plane_client).recover(recover_fn); + + let body = serde_json::to_vec(&SwapIndexingPipelinesRequest { + swaps: vec![ + SwapIndexingPipelinesEntry { + left_node_id: "indexer-1".to_string(), + left_index_id: "index-a".to_string(), + right_node_id: "indexer-2".to_string(), + right_index_id: Some("index-b".to_string()), + }, + SwapIndexingPipelinesEntry { + left_node_id: "indexer-3".to_string(), + left_index_id: "index-c".to_string(), + right_node_id: "indexer-4".to_string(), + right_index_id: Some("index-d".to_string()), + }, + ], + }) + .unwrap(); + + let resp = warp::test::request() + .method("POST") + .path("/indexing/swap-pipelines") + .header("content-type", "application/json") + .body(body) + .reply(&handler) + .await; + + assert_eq!(resp.status(), 200); + + let response: SwapIndexingPipelinesResponse = serde_json::from_slice(resp.body()).unwrap(); + assert_eq!(response.results.len(), 2); + assert!(response.results[0].success); + assert!(!response.results[1].success); + assert!( + response.results[1] + .reason + .contains("pipeline count mismatch") + ); + } + + #[tokio::test] + async fn test_swap_pipelines_handler_move_without_right_index() { + let mut mock = MockControlPlaneService::new(); + mock.expect_swap_indexing_pipelines().returning(|request| { + let results = request + 
.swaps + .iter() + .map(|swap| SwapIndexingPipelinesResult { + swap: Some(swap.clone()), + success: true, + reason: String::new(), + }) + .collect(); + Ok(SwapIndexingPipelinesResponse { results }) + }); + let control_plane_client = ControlPlaneServiceClient::from_mock(mock); + + let handler = swap_pipelines_handler(control_plane_client).recover(recover_fn); + + // Send JSON without right_index_id field — should deserialize to None. + let body = r#"{"swaps": [{"left_node_id": "indexer-1", "left_index_id": "index-a", "right_node_id": "indexer-2"}]}"#; + + let resp = warp::test::request() + .method("POST") + .path("/indexing/swap-pipelines") + .header("content-type", "application/json") + .body(body) + .reply(&handler) + .await; + + assert_eq!(resp.status(), 200); + + let response: SwapIndexingPipelinesResponse = serde_json::from_slice(resp.body()).unwrap(); + assert_eq!(response.results.len(), 1); + assert!(response.results[0].success); + let swap = response.results[0].swap.as_ref().unwrap(); + assert_eq!(swap.left_node_id, "indexer-1"); + assert_eq!(swap.left_index_id, "index-a"); + assert_eq!(swap.right_node_id, "indexer-2"); + assert!(swap.right_index_id.is_none()); + } + + #[tokio::test] + async fn test_swap_pipelines_handler_invalid_json_body() { + let mock = MockControlPlaneService::new(); + let control_plane_client = ControlPlaneServiceClient::from_mock(mock); + + let handler = swap_pipelines_handler(control_plane_client).recover(recover_fn); + + let resp = warp::test::request() + .method("POST") + .path("/indexing/swap-pipelines") + .header("content-type", "application/json") + .body(b"not json at all") + .reply(&handler) + .await; + + // Warp returns 400 for invalid JSON bodies. 
+ assert_eq!(resp.status(), 400); + } + + #[tokio::test] + async fn test_swap_pipelines_handler_wrong_method() { + let mock = MockControlPlaneService::new(); + let control_plane_client = ControlPlaneServiceClient::from_mock(mock); + + let handler = swap_pipelines_handler(control_plane_client).recover(recover_fn); + + let resp = warp::test::request() + .method("GET") + .path("/indexing/swap-pipelines") + .reply(&handler) + .await; + + // GET on a POST-only route returns 405. + assert_eq!(resp.status(), 405); + } +} diff --git a/quickwit/quickwit-serve/src/jaeger_api/rest_handler.rs b/quickwit/quickwit-serve/src/jaeger_api/rest_handler.rs index def8a4c6ca7..5ac8dafcb04 100644 --- a/quickwit/quickwit-serve/src/jaeger_api/rest_handler.rs +++ b/quickwit/quickwit-serve/src/jaeger_api/rest_handler.rs @@ -476,7 +476,6 @@ mod tests { num_hits: 0, hits: Vec::new(), elapsed_time_micros: 0, - errors: Vec::new(), aggregation_postcard: None, scroll_id: None, failed_splits: Vec::new(), @@ -509,7 +508,6 @@ mod tests { num_hits: 0, hits: Vec::new(), elapsed_time_micros: 0, - errors: Vec::new(), aggregation_postcard: None, scroll_id: None, failed_splits: Vec::new(), diff --git a/quickwit/quickwit-serve/src/lib.rs b/quickwit/quickwit-serve/src/lib.rs index 9c7543e2e04..eaeaa8d76af 100644 --- a/quickwit/quickwit-serve/src/lib.rs +++ b/quickwit/quickwit-serve/src/lib.rs @@ -28,6 +28,7 @@ mod indexing_api; mod ingest_api; mod jaeger_api; mod load_shield; + mod metrics; mod metrics_api; mod node_info_handler; @@ -38,6 +39,7 @@ mod rest; mod rest_api_response; mod search_api; pub(crate) mod simple_list; +mod soft_delete_api; pub mod tcp_listener; mod template_api; mod ui_handler; @@ -1025,7 +1027,7 @@ async fn setup_searcher( ) .await?; let search_service_clone = search_service.clone(); - let max_message_size = node_config.grpc_config.max_message_size; + let max_message_size = node_config.grpc_config.max_search_message_size; let searcher_change_stream = 
cluster_change_stream.filter_map(move |cluster_change| { let search_service_clone = search_service_clone.clone(); Box::pin(async move { diff --git a/quickwit/quickwit-serve/src/metrics.rs b/quickwit/quickwit-serve/src/metrics.rs index c1e4fa24d93..333e407a8ad 100644 --- a/quickwit/quickwit-serve/src/metrics.rs +++ b/quickwit/quickwit-serve/src/metrics.rs @@ -19,8 +19,8 @@ use quickwit_common::metrics::{ }; pub struct ServeMetrics { - pub http_requests_total: IntCounterVec<2>, - pub request_duration_secs: HistogramVec<2>, + pub http_requests_total: IntCounterVec<1>, + pub request_duration_secs: HistogramVec<1>, pub ongoing_requests: IntGaugeVec<1>, pub pending_requests: IntGaugeVec<1>, pub circuit_break_total: IntCounter, @@ -40,14 +40,14 @@ impl Default for ServeMetrics { "Total number of HTTP requests processed.", "", &[], - ["method", "status_code"], + ["status_code"], ), request_duration_secs: new_histogram_vec( "request_duration_secs", "Response time in seconds", "", &[], - ["method", "status_code"], + ["status_code"], // last bucket is 163.84s quickwit_common::metrics::exponential_buckets(0.02, 2.0, 14).unwrap(), ), diff --git a/quickwit/quickwit-serve/src/otlp_api/rest_handler.rs b/quickwit/quickwit-serve/src/otlp_api/rest_handler.rs index 1654a840dad..4ec47c15847 100644 --- a/quickwit/quickwit-serve/src/otlp_api/rest_handler.rs +++ b/quickwit/quickwit-serve/src/otlp_api/rest_handler.rs @@ -25,7 +25,6 @@ use quickwit_proto::opentelemetry::proto::collector::trace::v1::{ use quickwit_proto::types::IndexId; use quickwit_proto::{ServiceError, ServiceErrorCode, tonic}; use serde::{self, Serialize}; -use tracing::error; use warp::{Filter, Rejection}; use crate::decompression::get_body_bytes; diff --git a/quickwit/quickwit-serve/src/rest.rs b/quickwit/quickwit-serve/src/rest.rs index 3f193783b04..96c0f03fefd 100644 --- a/quickwit/quickwit-serve/src/rest.rs +++ b/quickwit/quickwit-serve/src/rest.rs @@ -13,12 +13,17 @@ // limitations under the License. 
use std::fmt::Formatter; +use std::future::Future; use std::io; +use std::pin::Pin; use std::sync::Arc; +use std::task::{Context, Poll, ready}; +use std::time::Instant; use hyper_util::rt::{TokioExecutor, TokioIo}; use hyper_util::server::conn::auto::Builder; use hyper_util::service::TowerToHyperService; +use pin_project::{pin_project, pinned_drop}; use quickwit_common::tower::BoxFutureInfaillible; use quickwit_config::{disable_ingest_v1, enable_ingest_v2}; use quickwit_search::SearchService; @@ -26,12 +31,11 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tokio::net::{TcpListener, TcpStream}; use tokio_rustls::TlsAcceptor; use tokio_util::either::Either; -use tower::ServiceBuilder; +use tower::{Layer, Service, ServiceBuilder}; use tower_http::compression::CompressionLayer; use tower_http::compression::predicate::{NotForContentType, Predicate, SizeAbove}; use tower_http::cors::{AllowOrigin, CorsLayer}; use tracing::{error, info}; -use warp::filters::log::Info; use warp::hyper::http::HeaderValue; use warp::hyper::{Method, StatusCode, http}; use warp::{Filter, Rejection, Reply, redirect}; @@ -43,7 +47,7 @@ use crate::developer_api::developer_api_routes; use crate::elasticsearch_api::elastic_api_handlers; use crate::health_check_api::health_check_handlers; use crate::index_api::index_management_handlers; -use crate::indexing_api::indexing_get_handler; +use crate::indexing_api::{indexing_get_handler, swap_pipelines_handler}; use crate::ingest_api::ingest_api_handlers; use crate::jaeger_api::jaeger_api_handlers; use crate::metrics_api::metrics_handler; @@ -53,6 +57,7 @@ use crate::rest_api_response::{RestApiError, RestApiResponse}; use crate::search_api::{ search_get_handler, search_plan_get_handler, search_plan_post_handler, search_post_handler, }; +use crate::soft_delete_api::soft_delete_api_handlers; use crate::template_api::index_template_api_handlers; use crate::ui_handler::ui_handler; use crate::{BodyFormat, BuildInfo, QuickwitServices, RuntimeInfo}; @@ -78,6 
+83,111 @@ impl std::fmt::Display for TooManyRequests { } } +/// Tower layer that records HTTP request metrics for every request, including +/// cancelled ones. +#[derive(Clone)] +struct HttpMetricsLayer; + +impl Layer for HttpMetricsLayer { + type Service = HttpMetricsService; + fn layer(&self, inner: S) -> Self::Service { + HttpMetricsService { inner } + } +} + +#[derive(Clone)] +struct HttpMetricsService { + inner: S, +} + +impl Service> for HttpMetricsService +where S: Service< + http::Request, + Response = http::Response, + Error = std::convert::Infallible, + > +{ + type Response = S::Response; + type Error = S::Error; + type Future = HttpMetricsFuture; + + fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll> { + self.inner.poll_ready(cx) + } + + fn call(&mut self, req: http::Request) -> Self::Future { + let method = req.method().to_string(); + let path = req.uri().path().to_string(); + let user_agent = req + .headers() + .get(http::header::USER_AGENT) + .and_then(|h| h.to_str().ok()) + .unwrap_or_default() + .to_string(); + HttpMetricsFuture { + inner: self.inner.call(req), + start: Instant::now(), + method, + status: None, + path, + user_agent, + } + } +} + +#[pin_project(PinnedDrop)] +struct HttpMetricsFuture { + #[pin] + inner: F, + start: Instant, + method: String, + path: String, + user_agent: String, + /// `None` while in-flight (including if dropped before completion). + /// `Some(status)` once the response future resolves. 
+ status: Option, +} + +#[pinned_drop] +impl PinnedDrop for HttpMetricsFuture { + fn drop(self: Pin<&mut Self>) { + let status = self.status.as_deref().unwrap_or("cancelled"); + let duration = self.start.elapsed(); + info!( + method = self.method, + path = self.path, + status = status, + elapsed_ms = duration.as_millis(), + ua = self.user_agent, + "request finished" + ); + crate::SERVE_METRICS + .http_requests_total + .with_label_values([status]) + .inc(); + crate::SERVE_METRICS + .request_duration_secs + .with_label_values([status]) + .observe(duration.as_secs_f64()); + } +} + +impl Future for HttpMetricsFuture +where F: Future, std::convert::Infallible>> +{ + type Output = F::Output; + + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let this = self.project(); + let result = ready!(this.inner.poll(cx)); + *this.status = Some(match &result { + Ok(response) => response.status().as_str().to_owned(), + Err(infallible) => match *infallible {}, + }); + Poll::Ready(result) + } +} + /// Env variable key to define the minimum size above which a response should be compressed. /// If unset, no compression is applied. 
const QW_MINIMUM_COMPRESSION_SIZE_KEY: &str = "QW_MINIMUM_COMPRESSION_SIZE"; @@ -132,19 +242,6 @@ pub(crate) async fn start_rest_server( readiness_trigger: BoxFutureInfaillible<()>, shutdown_signal: BoxFutureInfaillible<()>, ) -> anyhow::Result<()> { - let request_counter = warp::log::custom(|info: Info| { - let elapsed = info.elapsed(); - let status = info.status(); - let label_values: [&str; 2] = [info.method().as_str(), status.as_str()]; - crate::SERVE_METRICS - .request_duration_secs - .with_label_values(label_values) - .observe(elapsed.as_secs_f64()); - crate::SERVE_METRICS - .http_requests_total - .with_label_values(label_values) - .inc(); - }); // Docs routes let api_doc = warp::path("openapi.json") .and(warp::get()) @@ -199,7 +296,6 @@ pub(crate) async fn start_rest_server( .or(health_check_routes) .or(metrics_routes) .or(developer_routes) - .with(request_counter) .recover(recover_fn_final) .with(extra_headers) .boxed(); @@ -209,6 +305,7 @@ pub(crate) async fn start_rest_server( let cors = build_cors(&quickwit_services.node_config.rest_config.cors_allow_origins); let service = ServiceBuilder::new() + .layer(HttpMetricsLayer) .layer( CompressionLayer::new() .zstd(true) @@ -303,7 +400,10 @@ fn api_v1_routes( !disable_ingest_v1(), enable_ingest_v2(), ) - .or(cluster_handler(quickwit_services.cluster.clone())) + .or(cluster_handler( + quickwit_services.cluster.clone(), + quickwit_services.control_plane_client.clone(), + )) .boxed() .or(node_info_handler( BuildInfo::get(), @@ -315,6 +415,10 @@ fn api_v1_routes( quickwit_services.indexing_service_opt.clone(), )) .boxed() + .or(swap_pipelines_handler( + quickwit_services.control_plane_client.clone(), + )) + .boxed() .or(search_routes(quickwit_services.search_service.clone())) .boxed() .or(ingest_api_handlers( @@ -339,6 +443,11 @@ fn api_v1_routes( quickwit_services.metastore_client.clone(), )) .boxed() + .or(soft_delete_api_handlers( + quickwit_services.search_service.clone(), + 
quickwit_services.metastore_client.clone(), + )) + .boxed() .or(jaeger_api_handlers( quickwit_services.jaeger_service_opt.clone(), )) diff --git a/quickwit/quickwit-serve/src/search_api/grpc_adapter.rs b/quickwit/quickwit-serve/src/search_api/grpc_adapter.rs index c5250ee2465..463c5ca33b5 100644 --- a/quickwit/quickwit-serve/src/search_api/grpc_adapter.rs +++ b/quickwit/quickwit-serve/src/search_api/grpc_adapter.rs @@ -15,15 +15,18 @@ use std::sync::Arc; use async_trait::async_trait; +use futures::stream::{self, StreamExt}; use quickwit_proto::error::convert_to_grpc_result; use quickwit_proto::search::{ GetKvRequest, GetKvResponse, LeafListFieldsRequest, ListFieldsRequest, ListFieldsResponse, ReportSplitsRequest, ReportSplitsResponse, search_service_server as grpc, }; -use quickwit_proto::{set_parent_span_from_request_metadata, tonic}; +use quickwit_proto::{GrpcServiceError, set_parent_span_from_request_metadata, tonic}; use quickwit_search::SearchService; use tracing::instrument; +const FETCH_DOCS_BATCH_SIZE: usize = 500; + #[derive(Clone)] pub struct GrpcSearchAdapter(Arc); @@ -68,6 +71,41 @@ impl grpc::SearchService for GrpcSearchAdapter { convert_to_grpc_result(fetch_docs_result) } + type StreamFetchDocsStream = + quickwit_proto::tonic::codegen::BoxStream; + + #[instrument(skip(self, request))] + async fn stream_fetch_docs( + &self, + request: tonic::Request, + ) -> Result, tonic::Status> { + set_parent_span_from_request_metadata(request.metadata()); + let fetch_docs_request = request.into_inner(); + + // Call the regular fetch_docs method + let fetch_docs_result = self.0.fetch_docs(fetch_docs_request).await; + + let fetch_docs_response = match fetch_docs_result { + Ok(response) => response, + Err(err) => return Err(err.into_grpc_status()), + }; + + // If there is only one batch, return it directly to avoid copying to a new vec. 
+ if fetch_docs_response.hits.len() <= FETCH_DOCS_BATCH_SIZE { + let batch = quickwit_proto::search::FetchDocsResponse { + hits: fetch_docs_response.hits, + }; + let batch_stream = stream::iter([Ok(batch)]); + return Ok(tonic::Response::new(Box::pin(batch_stream))); + } + + let batch_stream = stream::iter(fetch_docs_response.hits) + .chunks(FETCH_DOCS_BATCH_SIZE) + .map(|batch| Ok(quickwit_proto::search::FetchDocsResponse { hits: batch })); + + Ok(tonic::Response::new(Box::pin(batch_stream))) + } + #[instrument(skip(self, request))] async fn root_list_terms( &self, diff --git a/quickwit/quickwit-serve/src/search_api/rest_handler.rs b/quickwit/quickwit-serve/src/search_api/rest_handler.rs index 671d7a6c2fa..a5192eb76b5 100644 --- a/quickwit/quickwit-serve/src/search_api/rest_handler.rs +++ b/quickwit/quickwit-serve/src/search_api/rest_handler.rs @@ -15,6 +15,7 @@ use std::convert::TryFrom; use std::sync::Arc; +use http::HeaderValue; use percent_encoding::percent_decode_str; use quickwit_config::validate_index_id_pattern; use quickwit_proto::search::{CountHits, SortField, SortOrder}; @@ -246,6 +247,7 @@ mod count_hits_from_bool { pub fn search_request_from_api_request( index_id_patterns: Vec, search_request: SearchRequestQueryString, + user_agent: Option, ) -> Result { // The query ast below may still contain user input query. 
The actual // parsing of the user query will happen in the root service, and might require @@ -269,6 +271,7 @@ pub fn search_request_from_api_request( count_hits: search_request.count_all.into(), ignore_missing_indexes: false, split_id: search_request.split_id, + user_agent, }; Ok(search_request) } @@ -276,10 +279,12 @@ pub fn search_request_from_api_request( async fn search_endpoint( index_id_patterns: Vec, search_request: SearchRequestQueryString, + user_agent: Option, search_service: &dyn SearchService, ) -> Result { let allow_failed_splits = search_request.allow_failed_splits; - let search_request = search_request_from_api_request(index_id_patterns, search_request)?; + let search_request = + search_request_from_api_request(index_id_patterns, search_request, user_agent)?; let search_response = search_service .root_search(search_request) @@ -298,20 +303,24 @@ async fn search_endpoint( } fn search_get_filter() --> impl Filter, SearchRequestQueryString), Error = Rejection> + Clone { +-> impl Filter, SearchRequestQueryString, Option), Error = Rejection> ++ Clone { warp::path!(String / "search") .and_then(extract_index_id_patterns) .and(warp::get()) .and(warp::query()) + .and(warp::header::optional::("user-agent")) } fn search_post_filter() --> impl Filter, SearchRequestQueryString), Error = Rejection> + Clone { +-> impl Filter, SearchRequestQueryString, Option), Error = Rejection> ++ Clone { warp::path!(String / "search") .and_then(extract_index_id_patterns) .and(warp::post()) .and(warp::body::content_length_limit(1024 * 1024)) .and(warp::body::json()) + .and(warp::header::optional::("user-agent")) } fn search_plan_get_filter() @@ -334,11 +343,18 @@ fn search_plan_post_filter() async fn search( index_id_patterns: Vec, search_request: SearchRequestQueryString, + user_agent: Option, search_service: Arc, ) -> impl warp::Reply { info!(request =? 
search_request, "search"); let body_format = search_request.format; - let result = search_endpoint(index_id_patterns, search_request, &*search_service).await; + let result = search_endpoint( + index_id_patterns, + search_request, + user_agent.and_then(|h| h.to_str().ok().map(str::to_owned)), + &*search_service, + ) + .await; into_rest_api_response(result, body_format) } @@ -349,7 +365,8 @@ async fn search_plan( ) -> impl warp::Reply { let body_format = search_request.format; let result: Result = async { - let plan_request = search_request_from_api_request(index_id_patterns, search_request)?; + let plan_request = + search_request_from_api_request(index_id_patterns, search_request, None)?; let plan_response = search_service.search_plan(plan_request).await?; let response = serde_json::from_str(&plan_response.result)?; Ok(response) @@ -503,7 +520,6 @@ mod tests { hits: Vec::new(), snippets: None, elapsed_time_micros: 0u64, - errors: Vec::new(), aggregations: None, }; let search_response_json: JsonValue = serde_json::to_value(search_response)?; @@ -522,7 +538,7 @@ mod tests { #[tokio::test] async fn test_rest_search_api_route_post() { let rest_search_api_filter = search_post_filter(); - let (indexes, req) = warp::test::request() + let (indexes, req, _) = warp::test::request() .method("POST") .path("/quickwit-demo-index/search") .json(&true) @@ -550,7 +566,7 @@ mod tests { #[tokio::test] async fn test_rest_search_api_route_post_multi_indexes() { let rest_search_api_filter = search_post_filter(); - let (indexes, req) = warp::test::request() + let (indexes, req, _) = warp::test::request() .method("POST") .path("/quickwit-demo-index,quickwit-demo,quickwit-demo-index-*/search") .json(&true) @@ -605,7 +621,7 @@ mod tests { #[tokio::test] async fn test_rest_search_api_route_simple() { let rest_search_api_filter = search_get_filter(); - let (indexes, req) = warp::test::request() + let (indexes, req, _) = warp::test::request() .path( 
"/quickwit-demo-index/search?query=*&end_timestamp=1450720000&max_hits=10&\ start_offset=22", @@ -633,7 +649,7 @@ mod tests { #[tokio::test] async fn test_rest_search_api_route_count_all() { let rest_search_api_filter = search_get_filter(); - let (indexes, req) = warp::test::request() + let (indexes, req, _) = warp::test::request() .path("/quickwit-demo-index/search?query=*&count_all=true") .filter(&rest_search_api_filter) .await @@ -651,7 +667,7 @@ mod tests { } ); let rest_search_api_filter = search_get_filter(); - let (indexes, req) = warp::test::request() + let (indexes, req, _) = warp::test::request() .path("/quickwit-demo-index/search?query=*&count_all=false") .filter(&rest_search_api_filter) .await @@ -673,7 +689,7 @@ mod tests { #[tokio::test] async fn test_rest_search_api_route_simple_default_num_hits_default_offset() { let rest_search_api_filter = search_get_filter(); - let (indexes, req) = warp::test::request() + let (indexes, req, _) = warp::test::request() .path( "/quickwit-demo-index/search?query=*&end_timestamp=1450720000&search_field=title,\ body", @@ -701,7 +717,7 @@ mod tests { #[tokio::test] async fn test_rest_search_api_route_simple_format() { let rest_search_api_filter = search_get_filter(); - let (indexes, req) = warp::test::request() + let (indexes, req, _) = warp::test::request() .path("/quickwit-demo-index/search?query=*&format=json") .filter(&rest_search_api_filter) .await @@ -826,7 +842,7 @@ mod tests { "/quickwit-demo-index/search?query=*&format=json&sort_by={sort_by_query_param}" ); let rest_search_api_filter = search_get_filter(); - let (_, req) = warp::test::request() + let (_, req, _) = warp::test::request() .path(&path) .filter(&rest_search_api_filter) .await @@ -840,7 +856,7 @@ mod tests { } let rest_search_api_filter = search_get_filter(); - let (_, req) = warp::test::request() + let (_, req, _) = warp::test::request() .path("/quickwit-demo-index/search?query=*&format=json&sort_by_field=fiel1") .filter(&rest_search_api_filter) 
.await @@ -897,7 +913,6 @@ mod tests { hits: Vec::new(), num_hits: 10, elapsed_time_micros: 16, - errors: Vec::new(), ..Default::default() }) }); @@ -1008,7 +1023,6 @@ mod tests { }], num_hits: 1, elapsed_time_micros: 16, - errors: Vec::new(), ..Default::default() }) }); @@ -1028,7 +1042,6 @@ mod tests { "hits": [{"title": "foo", "body": "foo bar baz"}], "snippets": [{"title": [], "body": ["foo bar baz"]}], "elapsed_time_micros": 16, - "errors": [], }); assert_json_eq!(resp_json, expected_response_json); Ok(()) diff --git a/quickwit/quickwit-serve/src/soft_delete_api/handler.rs b/quickwit/quickwit-serve/src/soft_delete_api/handler.rs new file mode 100644 index 00000000000..b7000237573 --- /dev/null +++ b/quickwit/quickwit-serve/src/soft_delete_api/handler.rs @@ -0,0 +1,373 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use itertools::Itertools; +use quickwit_metastore::IndexMetadataResponseExt; +use quickwit_proto::metastore::{ + IndexMetadataRequest, MetastoreService, MetastoreServiceClient, SoftDeleteDocumentsRequest, + SoftDeleteDocumentsResponse, SplitDocIds, +}; +use quickwit_proto::search::SearchRequest; +use quickwit_proto::types::IndexId; +use quickwit_query::query_ast::query_ast_from_user_text; +use quickwit_search::{SearchError, SearchService}; +use serde::{Deserialize, Serialize}; +use warp::{Filter, Rejection}; + +use crate::format::extract_format_from_qs; +use crate::rest::recover_fn; +use crate::rest_api_response::into_rest_api_response; +use crate::with_arg; + +const MAX_SOFT_DELETED_HITS: u64 = 100; + +#[allow(dead_code)] +#[derive(utoipa::OpenApi)] +#[openapi( + paths(post_soft_delete), + components(schemas(SoftDeleteRequest, SoftDeleteResponse)) +)] +pub struct SoftDeleteApi; + +/// Request body for the soft-delete endpoint. +#[derive(Deserialize, Debug, PartialEq, Eq, Default, utoipa::ToSchema)] +#[serde(deny_unknown_fields)] +pub struct SoftDeleteRequest { + /// Query text in Tantivy query language to match events to soft-delete. + pub query: String, + /// Maximum number of events to soft-delete in a single call (default: 100). + #[serde(default = "default_max_soft_deletes")] + pub max_hits: u64, + /// If set, restrict soft-delete to documents with a `timestamp >= start_timestamp`. + pub start_timestamp: Option, + /// If set, restrict soft-delete to documents with a `timestamp < end_timestamp`. + pub end_timestamp: Option, +} + +fn default_max_soft_deletes() -> u64 { + MAX_SOFT_DELETED_HITS +} + +/// Response from the soft-delete endpoint. +#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, utoipa::ToSchema)] +pub struct SoftDeleteResponse { + /// Total number of doc_ids that were newly soft-deleted across all splits. + pub num_soft_deleted_doc_ids: u64, +} + +/// Top-level filter combining all soft-delete API handlers. 
+pub fn soft_delete_api_handlers( + search_service: Arc, + metastore: MetastoreServiceClient, +) -> impl Filter + Clone { + post_soft_delete_handler(search_service, metastore.clone()) + .recover(recover_fn) + .boxed() +} + +fn post_soft_delete_handler( + search_service: Arc, + metastore: MetastoreServiceClient, +) -> impl Filter + Clone { + warp::path!(String / "soft-delete") + .and(warp::body::json()) + .and(warp::post()) + .and(with_arg(search_service)) + .and(with_arg(metastore)) + .then(post_soft_delete) + .and(extract_format_from_qs()) + .map(into_rest_api_response) +} + +#[utoipa::path( + post, + tag = "Soft Delete", + path = "/{index_id}/soft-delete", + request_body = SoftDeleteRequest, + responses( + (status = 200, description = "Successfully soft-deleted documents.", body = SoftDeleteResponse) + ), + params( + ("index_id" = String, Path, description = "The index ID to soft-delete documents from."), + ) +)] +/// Soft Delete Documents +/// +/// Runs a search query to identify matching documents, then records their internal +/// doc IDs in the metastore so they are excluded from future search results. +pub async fn post_soft_delete( + index_id: IndexId, + request: SoftDeleteRequest, + search_service: Arc, + metastore: MetastoreServiceClient, +) -> Result { + // 1. Build a SearchRequest from the soft-delete query. 
+ // Validate the query and make sure it doesn't require default search fields + let query_ast = query_ast_from_user_text(&request.query, None); + query_ast.clone().parse_user_query(&[])?; + let query_ast_json = serde_json::to_string(&query_ast) + .map_err(|err| SearchError::Internal(format!("failed to serialize query AST: {err}")))?; + + // Enforce a hits limit that guarantee we won't delete + // more than MAX_SOFT_DELETED_HITS per split + let max_hits = if request.max_hits > MAX_SOFT_DELETED_HITS { + MAX_SOFT_DELETED_HITS + } else { + request.max_hits + }; + + let search_request = SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: query_ast_json, + max_hits, + start_timestamp: request.start_timestamp, + end_timestamp: request.end_timestamp, + ..Default::default() + }; + + // 2. Execute root_search to get PartialHits (split_id, doc_id). + let search_response = search_service.root_search(search_request).await?; + + // 3. Group hits by split_id. + let split_doc_ids: Vec = search_response + .hits + .iter() + .filter_map(|hit| hit.partial_hit.as_ref()) + .into_group_map_by(|ph| ph.split_id.clone()) + .into_iter() + .map(|(split_id, hits)| SplitDocIds { + split_id, + doc_ids: hits.into_iter().map(|h| h.doc_id).collect(), + }) + .collect(); + + if split_doc_ids.is_empty() { + return Ok(SoftDeleteResponse { + num_soft_deleted_doc_ids: 0, + }); + } + + // 4. Resolve index_uid. + let index_metadata_request = IndexMetadataRequest::for_index_id(index_id.to_string()); + let index_uid = metastore + .index_metadata(index_metadata_request) + .await + .map_err(|err| SearchError::Internal(format!("failed to fetch index metadata: {err}")))? + .deserialize_index_metadata() + .map_err(|err| { + SearchError::Internal(format!("failed to deserialize index metadata: {err}")) + })? + .index_uid; + + // 5. Store in metastore. 
+ let SoftDeleteDocumentsResponse { + num_soft_deleted_doc_ids, + } = metastore + .soft_delete_documents(SoftDeleteDocumentsRequest { + index_uid: Some(index_uid), + split_doc_ids, + }) + .await + .map_err(|err| SearchError::Internal(format!("failed to soft-delete documents: {err}")))?; + + Ok(SoftDeleteResponse { + num_soft_deleted_doc_ids, + }) +} + +#[cfg(test)] +mod tests { + use std::net::{Ipv4Addr, SocketAddr}; + + use quickwit_config::SearcherConfig; + use quickwit_indexing::TestSandbox; + use quickwit_search::{ClusterClient, SearchJobPlacer, SearchServiceImpl, SearcherPool}; + use warp::Filter; + + use super::*; + use crate::rest::recover_fn; + + /// Build a real `Arc` wired to the given `TestSandbox`. + async fn build_search_service(sandbox: &TestSandbox) -> Arc { + let socket_addr = SocketAddr::new(Ipv4Addr::new(127, 0, 0, 1).into(), 7280u16); + let searcher_pool = SearcherPool::default(); + let search_job_placer = SearchJobPlacer::new(searcher_pool.clone()); + let cluster_client = ClusterClient::new(search_job_placer); + let searcher_config = SearcherConfig::default(); + let searcher_context = + Arc::new(quickwit_search::SearcherContext::new(searcher_config, None)); + let search_service: Arc = Arc::new(SearchServiceImpl::new( + sandbox.metastore(), + sandbox.storage_resolver(), + cluster_client, + searcher_context, + )); + let search_service_client = + quickwit_search::SearchServiceClient::from_service(search_service.clone(), socket_addr); + searcher_pool.insert(socket_addr, search_service_client); + search_service + } + + #[tokio::test] + async fn test_soft_delete_api_post_no_matching_docs() { + let index_id = "test-soft-delete-rest"; + let doc_mapping_yaml = r#" + field_mappings: + - name: title + type: text + - name: body + type: text + mode: lenient + "#; + let test_sandbox = TestSandbox::create(index_id, doc_mapping_yaml, "", &["title"]) + .await + .unwrap(); + let metastore = test_sandbox.metastore(); + let search_service = 
build_search_service(&test_sandbox).await; + let handler = soft_delete_api_handlers(search_service, metastore).recover(recover_fn); + + // POST a soft-delete query matching no docs → should get 0 + let resp = warp::test::request() + .path("/test-soft-delete-rest/soft-delete") + .method("POST") + .json(&true) + .body(r#"{"query": "title:nonexistent_term_xyz"}"#) + .reply(&handler) + .await; + assert_eq!(resp.status(), 200); + let response: SoftDeleteResponse = serde_json::from_slice(resp.body()).unwrap(); + assert_eq!(response.num_soft_deleted_doc_ids, 0); + + test_sandbox.assert_quit().await; + } + + #[tokio::test] + async fn test_soft_delete_api_post_with_matching_docs() { + let index_id = "test-soft-delete-match"; + let doc_mapping_yaml = r#" + field_mappings: + - name: title + type: text + mode: lenient + "#; + let test_sandbox = TestSandbox::create(index_id, doc_mapping_yaml, "", &["title"]) + .await + .unwrap(); + + // Ingest some documents. + let docs = vec![ + serde_json::json!({"title": "apple"}), + serde_json::json!({"title": "banana"}), + serde_json::json!({"title": "cherry"}), + ]; + test_sandbox.add_documents(docs).await.unwrap(); + + let metastore = test_sandbox.metastore(); + let search_service = build_search_service(&test_sandbox).await; + let handler = soft_delete_api_handlers(search_service, metastore).recover(recover_fn); + + // Soft-delete documents matching "apple". 
+ let resp = warp::test::request() + .path("/test-soft-delete-match/soft-delete") + .method("POST") + .json(&true) + .body(r#"{"query": "title:apple"}"#) + .reply(&handler) + .await; + assert_eq!(resp.status(), 200); + let response: SoftDeleteResponse = serde_json::from_slice(resp.body()).unwrap(); + assert_eq!(response.num_soft_deleted_doc_ids, 1); + + test_sandbox.assert_quit().await; + } + + #[tokio::test] + async fn test_soft_delete_api_post_idempotent() { + let index_id = "test-soft-delete-idempotent"; + let doc_mapping_yaml = r#" + field_mappings: + - name: title + type: text + mode: lenient + "#; + let test_sandbox = TestSandbox::create(index_id, doc_mapping_yaml, "", &["title"]) + .await + .unwrap(); + + let docs = vec![serde_json::json!({"title": "apple"})]; + test_sandbox.add_documents(docs).await.unwrap(); + + let metastore = test_sandbox.metastore(); + let search_service = build_search_service(&test_sandbox).await; + let handler = soft_delete_api_handlers(search_service, metastore).recover(recover_fn); + + // First soft-delete. + let resp = warp::test::request() + .path("/test-soft-delete-idempotent/soft-delete") + .method("POST") + .json(&true) + .body(r#"{"query": "title:apple"}"#) + .reply(&handler) + .await; + assert_eq!(resp.status(), 200); + let response: SoftDeleteResponse = serde_json::from_slice(resp.body()).unwrap(); + assert_eq!(response.num_soft_deleted_doc_ids, 1); + + // Second soft-delete of same doc — the doc is already excluded from search + // results, so the search won't find it again, yielding 0 new deletions. 
+ let resp = warp::test::request() + .path("/test-soft-delete-idempotent/soft-delete") + .method("POST") + .json(&true) + .body(r#"{"query": "title:apple"}"#) + .reply(&handler) + .await; + assert_eq!(resp.status(), 200); + let response: SoftDeleteResponse = serde_json::from_slice(resp.body()).unwrap(); + assert_eq!(response.num_soft_deleted_doc_ids, 0); + + test_sandbox.assert_quit().await; + } + + #[tokio::test] + async fn test_soft_delete_api_post_deny_unknown_fields() { + let index_id = "test-soft-delete-unknown"; + let doc_mapping_yaml = r#" + field_mappings: + - name: title + type: text + mode: lenient + "#; + let test_sandbox = TestSandbox::create(index_id, doc_mapping_yaml, "", &["title"]) + .await + .unwrap(); + let metastore = test_sandbox.metastore(); + let search_service = build_search_service(&test_sandbox).await; + let handler = soft_delete_api_handlers(search_service, metastore).recover(recover_fn); + + // POST with unknown field should fail. + let resp = warp::test::request() + .path("/test-soft-delete-unknown/soft-delete") + .method("POST") + .json(&true) + .body(r#"{"query": "title:apple", "unknown_field": true}"#) + .reply(&handler) + .await; + assert_eq!(resp.status(), 400); + + test_sandbox.assert_quit().await; + } +} diff --git a/quickwit/quickwit-serve/src/soft_delete_api/mod.rs b/quickwit/quickwit-serve/src/soft_delete_api/mod.rs new file mode 100644 index 00000000000..d72811748f5 --- /dev/null +++ b/quickwit/quickwit-serve/src/soft_delete_api/mod.rs @@ -0,0 +1,17 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +mod handler; + +pub use handler::soft_delete_api_handlers; diff --git a/quickwit/quickwit-storage/src/object_storage/policy.rs b/quickwit/quickwit-storage/src/object_storage/policy.rs index 6ce48ab7a94..9f6b0ced2b6 100644 --- a/quickwit/quickwit-storage/src/object_storage/policy.rs +++ b/quickwit/quickwit-storage/src/object_storage/policy.rs @@ -67,9 +67,11 @@ impl MultiPartPolicy { impl Default for MultiPartPolicy { fn default() -> Self { MultiPartPolicy { - // S3 limits part size from 5M to 5GB, we want to end up with as few parts as possible - // since each part is charged as a put request. - target_part_num_bytes: 5_000_000_000, // 5GB + // QW originally used 5GB to limit the number of PUT requests. This + // is a bit excessive, and many cloud providers don't bill by + // request. We don't want it to be too small either because parts + // incur a performance overhead when a range request spans 2 parts. 
+ target_part_num_bytes: 2_000_000_000, // 2GB multipart_threshold_num_bytes: 128 * 1_024 * 1_024, // 128 MiB max_num_parts: 10_000, max_object_num_bytes: 5_000_000_000_000u64, // S3 allows up to 5TB objects diff --git a/quickwit/quickwit-storage/src/object_storage/s3_compatible_storage.rs b/quickwit/quickwit-storage/src/object_storage/s3_compatible_storage.rs index ecce3c795da..d34eedcc5fd 100644 --- a/quickwit/quickwit-storage/src/object_storage/s3_compatible_storage.rs +++ b/quickwit/quickwit-storage/src/object_storage/s3_compatible_storage.rs @@ -17,10 +17,12 @@ use std::ops::Range; use std::path::{Path, PathBuf}; use std::pin::Pin; use std::task::{Context, Poll}; +use std::time::Duration; use std::{fmt, io}; use anyhow::{Context as AnyhhowContext, anyhow}; use async_trait::async_trait; +use aws_config::timeout::TimeoutConfig; use aws_credential_types::provider::SharedCredentialsProvider; use aws_sdk_s3::Client as S3Client; use aws_sdk_s3::config::{Credentials, Region}; @@ -145,7 +147,13 @@ pub async fn create_s3_client(s3_storage_config: &S3StorageConfig) -> S3Client { s3_config.set_retry_config(aws_config.retry_config().cloned()); s3_config.set_sleep_impl(aws_config.sleep_impl()); s3_config.set_stalled_stream_protection(aws_config.stalled_stream_protection()); - s3_config.set_timeout_config(aws_config.timeout_config().cloned()); + s3_config.set_timeout_config(Some( + TimeoutConfig::builder() + .connect_timeout(Duration::from_secs(5)) + .operation_attempt_timeout(Duration::from_secs(900)) // Single attempt timeout + .operation_timeout(Duration::from_secs(1800)) // Total timeout + .build(), + )); if let Some(endpoint) = s3_storage_config.endpoint() { info!(endpoint=%endpoint, "using S3 endpoint defined in storage config or environment variable"); diff --git a/quickwit/quickwit-ui/src/components/IndexSummary.tsx b/quickwit/quickwit-ui/src/components/IndexSummary.tsx index c3eca2da261..7be3b8b01ee 100644 --- a/quickwit/quickwit-ui/src/components/IndexSummary.tsx 
+++ b/quickwit/quickwit-ui/src/components/IndexSummary.tsx @@ -13,7 +13,7 @@ // limitations under the License. import styled from "@emotion/styled"; -import { Paper } from "@mui/material"; +import { Alert, Paper } from "@mui/material"; import dayjs from "dayjs"; import utc from "dayjs/plugin/utc"; import { FC, ReactNode } from "react"; @@ -75,6 +75,12 @@ export function IndexSummary({ index }: { index: Index }) { return ( + {index.split_limit_reached && ( + + Split limit reached. Only the first 10,000 splits were retrieved. + The actual total may be higher. Statistics shown are incomplete. + + )} {dayjs .unix(index.metadata.create_timestamp) diff --git a/quickwit/quickwit-ui/src/services/client.ts b/quickwit/quickwit-ui/src/services/client.ts index cc7643b6687..95baaceed99 100644 --- a/quickwit/quickwit-ui/src/services/client.ts +++ b/quickwit/quickwit-ui/src/services/client.ts @@ -81,7 +81,8 @@ export class Client { ]); return { metadata: metadata, - splits: splits, + splits: splits[0], + split_limit_reached: splits[1], }; } @@ -89,14 +90,16 @@ export class Client { return this.fetch(`${this.apiRoot()}indexes/${indexId}`, {}); } - async getAllSplits(indexId: string): Promise> { + async getAllSplits( + indexId: string, + ): Promise<[Array, boolean]> { // TODO: restrieve all the splits. 
const results: { splits: Array } = await this.fetch( `${this.apiRoot()}indexes/${indexId}/splits?limit=10000`, {}, ); - return results["splits"]; + return [results["splits"], results["splits"].length === 10000]; } async listIndexes(): Promise> { diff --git a/quickwit/quickwit-ui/src/utils/models.ts b/quickwit/quickwit-ui/src/utils/models.ts index 67e77add3de..8abe8acc6e1 100644 --- a/quickwit/quickwit-ui/src/utils/models.ts +++ b/quickwit/quickwit-ui/src/utils/models.ts @@ -282,6 +282,7 @@ export type Range = { export type Index = { metadata: IndexMetadata; splits: SplitMetadata[]; + split_limit_reached: boolean; }; export type Cluster = { diff --git a/quickwit/rest-api-tests/scenarii/aggregations/0001-aggregations.yaml b/quickwit/rest-api-tests/scenarii/aggregations/0001-aggregations.yaml index 40413bbfcec..755e8ae1db1 100644 --- a/quickwit/rest-api-tests/scenarii/aggregations/0001-aggregations.yaml +++ b/quickwit/rest-api-tests/scenarii/aggregations/0001-aggregations.yaml @@ -284,14 +284,16 @@ expected: response: values: - key: 85.0 - value: 100.49456770856702 + value: + $expect: 'abs(val - 100.4945) < 0.1' - doc_count: 2 key: 1422662400000.0 key_as_string: '2015-01-31T00:00:00Z' response: values: - key: 85.0 - value: 30.26717133872237 + value: + $expect: 'abs(val - 30.2617) < 0.1' --- # Test histogram method: [GET] @@ -353,12 +355,16 @@ json: field: "date" expected: aggregations: + # cardinality queries are currently being improved upstream unique_names: - value: 8.0 + value: + $expect: 'abs(val - 8) <= 2' unique_response: - value: 5.0 # TODO: Check. 
The correct number is 6 + value: + $expect: 'abs(val - 6) <= 2' unique_dates: - value: 6.0 + value: + $expect: 'abs(val - 6) <= 3' --- # Test extended stats aggregation method: [GET] diff --git a/quickwit/rest-api-tests/scenarii/es_compatibility/0018-search_after.yaml b/quickwit/rest-api-tests/scenarii/es_compatibility/0018-search_after.yaml index bd24f4fb718..c34cd43d64a 100644 --- a/quickwit/rest-api-tests/scenarii/es_compatibility/0018-search_after.yaml +++ b/quickwit/rest-api-tests/scenarii/es_compatibility/0018-search_after.yaml @@ -45,24 +45,6 @@ expected: hits: - sort: [9018] --- -# Test with a search after value as string -# Quickwit should convert it to the correct type -json: - size: 1 - query: - match_all: {} - sort: - - actor.id: - order: asc - search_after: ["5688"] -expected: - hits: - total: - value: 100 - relation: eq - hits: - - sort: [9018] ---- json: size: 1 query: @@ -93,21 +75,6 @@ expected: hits: $expect: "len(val) == 4" --- -# Quickwit should accept timestamp as string. 
-json: - size: 100 - track_total_hits: true - query: - match_all: {} - sort: - - created_at: - order: asc - search_after: ["1422748815000"] -expected: - hits: - hits: - $expect: "len(val) == 4" ---- json: size: 100 track_total_hits: true @@ -116,7 +83,7 @@ json: sort: - created_at: order: desc - search_after: ["1422748800001"] + search_after: [1422748800001] expected: hits: hits: diff --git a/quickwit/rest-api-tests/scenarii/qw_search_api/0001_ts_range.yaml b/quickwit/rest-api-tests/scenarii/qw_search_api/0001_ts_range.yaml index dc9765b634e..7dae4d645da 100644 --- a/quickwit/rest-api-tests/scenarii/qw_search_api/0001_ts_range.yaml +++ b/quickwit/rest-api-tests/scenarii/qw_search_api/0001_ts_range.yaml @@ -40,3 +40,9 @@ params: query: "auto_date:>=2023-05-25T00:00:00Z AND auto_date:<2023-05-26T00:00:00Z" expected: num_hits: 2 +--- +endpoint: millisec/search +params: + query: "ts:>=2022-12-16T10:00:57.000Z AND ts:<=2022-12-16T10:00:57.000Z" +expected: + num_hits: 1 \ No newline at end of file diff --git a/quickwit/rest-api-tests/scenarii/qw_search_api/_setup.quickwit.yaml b/quickwit/rest-api-tests/scenarii/qw_search_api/_setup.quickwit.yaml index b333ed3c86a..e410ecd96c0 100644 --- a/quickwit/rest-api-tests/scenarii/qw_search_api/_setup.quickwit.yaml +++ b/quickwit/rest-api-tests/scenarii/qw_search_api/_setup.quickwit.yaml @@ -98,3 +98,31 @@ ndjson: - {"text_raw": "indexed with raw tokenizer dashes"} - {"text_fast": "fast-text-value-dashes"} - {"text_fast": "fast text value whitespaces"} +--- +method: DELETE +endpoint: indexes/millisec +status_code: null +--- +method: POST +endpoint: indexes/ +json: + version: "0.7" + index_id: millisec + doc_mapping: + timestamp_field: ts + mode: strict + field_mappings: + - name: ts + type: datetime + fast: true + input_formats: ["rfc3339"] + fast_precision: milliseconds +--- +method: POST +endpoint: millisec/ingest +params: + commit: force +ndjson: + - {"ts": "2022-12-16T10:00:56.297Z"} + - {"ts": "2022-12-16T10:00:57.000Z"} + - 
{"ts": "2022-12-16T10:00:57.297Z"} \ No newline at end of file diff --git a/quickwit/rest-api-tests/scenarii/qw_search_api/_teardown.quickwit.yaml b/quickwit/rest-api-tests/scenarii/qw_search_api/_teardown.quickwit.yaml index 56cd2bda8a9..ebfa1c4931b 100644 --- a/quickwit/rest-api-tests/scenarii/qw_search_api/_teardown.quickwit.yaml +++ b/quickwit/rest-api-tests/scenarii/qw_search_api/_teardown.quickwit.yaml @@ -3,3 +3,6 @@ endpoint: indexes/simple --- method: DELETE endpoint: indexes/nested +--- +method: DELETE +endpoint: indexes/millisec \ No newline at end of file diff --git a/quickwit/rest-api-tests/scenarii/search_after/0001-search_after_edge_case.yaml b/quickwit/rest-api-tests/scenarii/search_after/0001-search_after_edge_case.yaml index a1e958e0e50..85f6aa999f6 100644 --- a/quickwit/rest-api-tests/scenarii/search_after/0001-search_after_edge_case.yaml +++ b/quickwit/rest-api-tests/scenarii/search_after/0001-search_after_edge_case.yaml @@ -227,9 +227,9 @@ expected: relation: eq hits: - sort: [0] - - sort: [True] - sort: [10.5] - sort: [18000000000000000000] + - sort: [True] --- desc: "search after on mixed column desc match nothing" json: @@ -263,8 +263,8 @@ expected: value: 5 relation: eq hits: - - sort: [True] - sort: [0] - sort: [-10] + diff --git a/quickwit/rust-toolchain.toml b/quickwit/rust-toolchain.toml index e54a09951e9..2a30998f14b 100644 --- a/quickwit/rust-toolchain.toml +++ b/quickwit/rust-toolchain.toml @@ -1,4 +1,4 @@ [toolchain] -channel = "1.91" +channel = "1.93" components = ["cargo", "clippy", "rustfmt", "rust-docs"]