Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 64 additions & 61 deletions tests/core/framework/batch/batch_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -324,47 +324,48 @@ TEST(BatchTest, Basic) {
EXPECT_TRUE(equal(forward_input.positions, expected_pos));

// check the input parameters
const ModelInputParams& input_params = forward_input.input_params;
EXPECT_TRUE(input_params.meta.batch_forward_type.is_mixed());
EXPECT_EQ(input_params.meta.num_sequences, 4);
EXPECT_EQ(input_params.meta.q_max_seq_len, 9);
EXPECT_EQ(input_params.meta.kv_max_seq_len, 16);
EXPECT_EQ(input_params.embedding.embedding_ids, std::vector<int32_t>({-1, -1, -1}));
EXPECT_EQ(input_params.embedding.linear_state_ids,
EXPECT_TRUE(forward_input.meta.batch_forward_type.is_mixed());
EXPECT_EQ(forward_input.meta.num_sequences, 4);
EXPECT_EQ(forward_input.meta.q_max_seq_len, 9);
EXPECT_EQ(forward_input.meta.kv_max_seq_len, 16);
EXPECT_EQ(forward_input.embedding.embedding_ids,
std::vector<int32_t>({-1, -1, -1}));
EXPECT_EQ(forward_input.embedding.linear_state_ids,
std::vector<int32_t>({-1, -1, -1, -1}));

#if defined(USE_NPU)
const std::vector<int32_t> q_seq_lens = {9, 1, 1, 4};
#else
const std::vector<int32_t> q_seq_lens = {0, 9, 10, 11, 15};
#endif
EXPECT_TRUE(equal(input_params.attention.device.q_seq_lens, q_seq_lens));
EXPECT_TRUE(equal(forward_input.attention.device.q_seq_lens, q_seq_lens));

// seq4's kv_seq_len = q_len + num_cached_tokens (q_len<=max_allowed_tokens)
#if defined(USE_NPU)
const std::vector<int32_t> kv_seq_lens = {9, 8, 16, 8};
#else
const std::vector<int32_t> kv_seq_lens = {0, 9, 17, 33, 41};
#endif
EXPECT_TRUE(equal(input_params.attention.device.kv_seq_lens, kv_seq_lens));
EXPECT_TRUE(equal(forward_input.attention.device.kv_seq_lens, kv_seq_lens));

const std::vector<int32_t> new_cache_slots = {
/*seq1*/ 4, 5, 6, 7, 8, 9, 10, 11, 12,
/*seq2*/ 23,
/*seq3*/ 47,
/*seq4*/ 56,57,58,59
};
EXPECT_TRUE(equal(input_params.attention.device.new_cache_slots, new_cache_slots));
EXPECT_TRUE(equal(forward_input.attention.device.new_cache_slots,
new_cache_slots));

const std::vector<int32_t> block_tables = {
/*seq1*/ 1, 2, 3, 0, 0,
/*seq2*/ 4, 5, 6, 7, 0,
/*seq3*/ 8, 9, 10, 11, 12,
/*seq4*/ 13, 14, 15, 0, 0};
EXPECT_TRUE(equal(input_params.attention.device.block_tables, block_tables));
EXPECT_TRUE(equal(forward_input.attention.device.block_tables, block_tables));

// const std::vector<int32_t> last_token_idxes = {8, 9, 10};
// EXPECT_TRUE(equal(input_params.last_token_idxes, last_token_idxes));
// EXPECT_TRUE(equal(forward_input.last_token_idxes, last_token_idxes));

const auto& sampling_params = forward_input.sampling_params;
const std::vector<int64_t> unique_ids = {
Expand Down Expand Up @@ -578,7 +579,7 @@ TEST(BatchTest, ForwardInputPreservesTransferInfoAndBatchId) {
(std::vector<uint64_t>{1, 2}));
EXPECT_EQ(input.transfer_kv_infos[0].remote_blocks_ids,
(std::vector<uint64_t>{100, 101}));
EXPECT_EQ(input.input_params.meta.batch_id, batch_id);
EXPECT_EQ(input.meta.batch_id, batch_id);
}

TEST(BatchTest, ForwardInputPackedRoundTripPreservesTransportFields) {
Expand Down Expand Up @@ -650,7 +651,7 @@ TEST(BatchTest, ForwardInputPackedRoundTripPreservesTransportFields) {
ForwardInput round_trip;
reader_manager.input_read(round_trip, torch::Device(torch::kCPU));

EXPECT_EQ(round_trip.input_params.meta.batch_id, batch_id);
EXPECT_EQ(round_trip.meta.batch_id, batch_id);
EXPECT_TRUE(equal(round_trip.token_ids, std::vector<int32_t>({1, 2, 3, 4})));
ASSERT_EQ(round_trip.transfer_kv_infos.size(), 1u);
EXPECT_EQ(round_trip.transfer_kv_infos[0].local_blocks_ids,
Expand Down Expand Up @@ -709,18 +710,18 @@ TEST(BatchTest, ForwardInputBlockCopyKernelFieldsMatchExpectedLayout) {
forward_builder.build_forward_input(/*num_decoding_tokens=*/1,
/*min_decoding_batch_size=*/0);

EXPECT_TRUE(equal(forward_input.input_params.block_copy.src_block_indices,
EXPECT_TRUE(equal(forward_input.block_copy.src_block_indices,
std::vector<int32_t>({7, 8})));
EXPECT_TRUE(equal(forward_input.input_params.block_copy.dst_block_indices,
EXPECT_TRUE(equal(forward_input.block_copy.dst_block_indices,
std::vector<int32_t>({10, 11, 12})));
EXPECT_TRUE(equal(forward_input.input_params.block_copy.cum_sum,
std::vector<int32_t>({2, 3})));
EXPECT_TRUE(
equal(forward_input.block_copy.cum_sum, std::vector<int32_t>({2, 3})));

#if defined(USE_CUDA)
EXPECT_EQ(forward_input.input_params.block_copy.swap_blocks.size(),
EXPECT_EQ(forward_input.block_copy.swap_blocks.size(),
forward_swap_blocks.size());
#else
EXPECT_TRUE(forward_input.input_params.block_copy.swap_blocks.empty());
EXPECT_TRUE(forward_input.block_copy.swap_blocks.empty());
#endif

FLAGS_enable_block_copy_kernel = old_enable_block_copy_kernel;
Expand Down Expand Up @@ -780,13 +781,13 @@ TEST(BatchTest, ForwardInputCpPartitionMatchesExpectedLayout) {
equal(cp_forward_input.positions, std::vector<int32_t>({0, 1, 6, 7})));
EXPECT_TRUE(equal(cp_forward_input.sampling_params.selected_token_idxes,
std::vector<int32_t>({3})));
EXPECT_EQ(cp_forward_input.input_params.meta.q_max_seq_len, 4);
EXPECT_EQ(cp_forward_input.input_params.meta.kv_max_seq_len, 4);
EXPECT_EQ(cp_forward_input.meta.q_max_seq_len, 4);
EXPECT_EQ(cp_forward_input.meta.kv_max_seq_len, 4);

const std::vector<int32_t>& q_seq_lens =
cp_forward_input.input_params.attention.host.q_seq_lens;
cp_forward_input.attention.host.q_seq_lens;
const std::vector<int32_t>& kv_seq_lens =
cp_forward_input.input_params.attention.host.kv_seq_lens;
cp_forward_input.attention.host.kv_seq_lens;
EXPECT_TRUE((q_seq_lens == std::vector<int32_t>({4}) ||
q_seq_lens == std::vector<int32_t>({0, 4})));
EXPECT_TRUE((kv_seq_lens == std::vector<int32_t>({4}) ||
Expand Down Expand Up @@ -910,13 +911,11 @@ TEST(BatchTest, SampleRequestKeepsThreadedForwardBuilderOffsetsStable) {
expected_selected_token_idxes));
EXPECT_TRUE(
equal(forward_input.sampling_params.sample_idxes, expected_sample_idxes));
ASSERT_EQ(forward_input.input_params.embedding.embedding_ids.size(),
sequences.size());
ASSERT_EQ(forward_input.input_params.embedding.linear_state_ids.size(),
sequences.size());
EXPECT_EQ(forward_input.input_params.embedding.embedding_ids,
ASSERT_EQ(forward_input.embedding.embedding_ids.size(), sequences.size());
ASSERT_EQ(forward_input.embedding.linear_state_ids.size(), sequences.size());
EXPECT_EQ(forward_input.embedding.embedding_ids,
std::vector<int32_t>({-1, -1}));
EXPECT_EQ(forward_input.input_params.embedding.linear_state_ids,
EXPECT_EQ(forward_input.embedding.linear_state_ids,
std::vector<int32_t>({-1, -1}));
}

Expand Down Expand Up @@ -967,11 +966,10 @@ TEST(BatchTest, DecodeMinBatchSizeDoesNotPadTransportState) {
builder.build_forward_input(/*num_decoding_tokens=*/1,
/*min_decoding_batch_size=*/3);

EXPECT_EQ(forward_input.input_params.meta.num_sequences, 1);
EXPECT_EQ(forward_input.input_params.embedding.linear_state_ids,
std::vector<int32_t>({-1}));
EXPECT_EQ(forward_input.input_params.embedding.embedding_ids,
EXPECT_EQ(forward_input.meta.num_sequences, 1);
EXPECT_EQ(forward_input.embedding.linear_state_ids,
std::vector<int32_t>({-1}));
EXPECT_EQ(forward_input.embedding.embedding_ids, std::vector<int32_t>({-1}));
}

TEST(BatchTest, DecodeSingleBlockIdsStaySplitInTransportButShareSlotValue) {
Expand Down Expand Up @@ -1026,15 +1024,13 @@ TEST(BatchTest, DecodeSingleBlockIdsStaySplitInTransportButShareSlotValue) {
builder.build_forward_input(/*num_decoding_tokens=*/1,
/*min_decoding_batch_size=*/0);

ASSERT_EQ(forward_input.input_params.embedding.embedding_ids.size(), 1u);
ASSERT_EQ(forward_input.input_params.embedding.linear_state_ids.size(), 1u);
EXPECT_EQ(forward_input.input_params.embedding.embedding_ids[0],
expected_slot_id);
EXPECT_EQ(forward_input.input_params.embedding.linear_state_ids[0],
expected_slot_id);
ASSERT_EQ(forward_input.embedding.embedding_ids.size(), 1u);
ASSERT_EQ(forward_input.embedding.linear_state_ids.size(), 1u);
EXPECT_EQ(forward_input.embedding.embedding_ids[0], expected_slot_id);
EXPECT_EQ(forward_input.embedding.linear_state_ids[0], expected_slot_id);
}

TEST(BatchTest, SharedMemoryRoundTripPreservesLinearStateIds) {
TEST(BatchTest, SharedMemoryRoundTripPreservesAndDefaultsLinearStateIds) {
ForwardInput forward_input;
auto int_options = torch::TensorOptions()
.dtype(torch::kInt)
Expand All @@ -1046,30 +1042,30 @@ TEST(BatchTest, SharedMemoryRoundTripPreservesLinearStateIds) {
forward_input.positions =
torch::tensor(std::vector<int32_t>({0, 0}), int_options);
forward_input.positions_host = forward_input.positions;
forward_input.input_params.meta.batch_forward_type = BatchForwardType::DECODE;
forward_input.input_params.meta.num_sequences = 2;
forward_input.input_params.meta.kv_max_seq_len = 1;
forward_input.input_params.meta.q_max_seq_len = 1;
forward_input.input_params.attention.host.kv_seq_lens = {1, 1};
forward_input.input_params.attention.host.q_seq_lens = {1, 1};
forward_input.input_params.attention.host.q_cu_seq_lens = {1, 2};
forward_input.input_params.attention.host.kv_cache_tokens_nums = {0, 0};
forward_input.input_params.attention.host.new_cache_slots = {0, 0};
forward_input.input_params.attention.device.kv_seq_lens =
forward_input.meta.batch_forward_type = BatchForwardType::DECODE;
forward_input.meta.num_sequences = 2;
forward_input.meta.kv_max_seq_len = 1;
forward_input.meta.q_max_seq_len = 1;
forward_input.attention.host.kv_seq_lens = {1, 1};
forward_input.attention.host.q_seq_lens = {1, 1};
forward_input.attention.host.q_cu_seq_lens = {1, 2};
forward_input.attention.host.kv_cache_tokens_nums = {0, 0};
forward_input.attention.host.new_cache_slots = {0, 0};
forward_input.attention.device.kv_seq_lens =
torch::tensor(std::vector<int32_t>({1, 1}), int_options);
forward_input.input_params.attention.device.q_seq_lens =
forward_input.attention.device.q_seq_lens =
torch::tensor(std::vector<int32_t>({1, 1}), int_options);
forward_input.input_params.attention.device.q_cu_seq_lens =
forward_input.attention.device.q_cu_seq_lens =
torch::tensor(std::vector<int32_t>({1, 2}), int_options);
forward_input.input_params.attention.device.kv_cache_tokens_nums =
forward_input.attention.device.kv_cache_tokens_nums =
torch::tensor(std::vector<int32_t>({0, 0}), int_options);
forward_input.input_params.attention.device.new_cache_slots =
forward_input.attention.device.new_cache_slots =
torch::tensor(std::vector<int32_t>({0, 0}), int_options);
forward_input.input_params.attention.device.block_tables = create_2d_tensor(
forward_input.attention.device.block_tables = create_2d_tensor(
std::vector<std::vector<int32_t>>{{0}, {0}}, torch::kInt);
forward_input.input_params.attention.host.block_tables =
forward_input.input_params.attention.device.block_tables;
forward_input.input_params.embedding.linear_state_ids = {4, 6};
forward_input.attention.host.block_tables =
forward_input.attention.device.block_tables;
forward_input.embedding.linear_state_ids = {4, 6};

bool is_creator = false;
auto shm_name =
Expand All @@ -1086,8 +1082,15 @@ TEST(BatchTest, SharedMemoryRoundTripPreservesLinearStateIds) {

ForwardInput from_shm;
reader_manager.input_read(from_shm, torch::Device(torch::kCPU));
EXPECT_EQ(from_shm.input_params.embedding.linear_state_ids,
std::vector<int32_t>({4, 6}));
EXPECT_EQ(from_shm.embedding.linear_state_ids, std::vector<int32_t>({4, 6}));

forward_input.embedding.linear_state_ids.clear();
ASSERT_TRUE(writer_manager.input_write(forward_input));

ForwardInput legacy_from_shm;
reader_manager.input_read(legacy_from_shm, torch::Device(torch::kCPU));
EXPECT_EQ(legacy_from_shm.embedding.linear_state_ids,
std::vector<int32_t>({-1, -1}));
}

TEST(BatchTest, SampleRequestProcessesAllMatchedRawOutputs) {
Expand Down
10 changes: 3 additions & 7 deletions tests/core/framework/hf_model_loader_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,10 @@ class DummyRecCausalLM final : public RecCausalLM {
explicit DummyRecCausalLM(const torch::TensorOptions& options)
: options_(options) {}

ModelOutput forward(const torch::Tensor& tokens,
const torch::Tensor& positions,
std::vector<KVCache>& kv_caches,
const ModelInputParams& parameters) override {
UNUSED_PARAMETER(tokens);
UNUSED_PARAMETER(positions);
ModelOutput forward(const ForwardInput& input,
std::vector<KVCache>& kv_caches) override {
UNUSED_PARAMETER(input);
UNUSED_PARAMETER(kv_caches);
UNUSED_PARAMETER(parameters);
return ModelOutput();
}

Expand Down
Loading
Loading