jd-opensource · ethan686 · Apr 14, 2026 · Apr 15, 2026 · Apr 15, 2026 · May 12, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -270,6 +270,7 @@ find_package(GTest CONFIG REQUIRED)
 find_package(benchmark CONFIG REQUIRED)
 find_package(nlohmann_json CONFIG REQUIRED)
 find_package(OpenCV CONFIG REQUIRED)
+find_package(FFMPEG REQUIRED)
 find_package(Python COMPONENTS Development REQUIRED)
 find_package(pybind11 CONFIG REQUIRED)
 

diff --git a/vcpkg.json b/vcpkg.json
@@ -71,6 +71,11 @@
       "name": "snappy",
       "version>=": "1.1.10"
     },
+    {
+      "name": "ffmpeg",
+      "default-features": false,
+      "features": ["avcodec", "avformat", "swscale", "swresample", "x264"]
+    },
     {
       "name": "opencv4",
       "version>=": "4.7.0",

@@ -92,7 +92,7 @@ else()
 endif()
 
 # link brpc
-target_link_libraries(xllm PRIVATE glog::glog brpc leveldb::leveldb protobuf::libprotobuf ${OpenCV_LIBS})
+target_link_libraries(xllm PRIVATE glog::glog brpc leveldb::leveldb protobuf::libprotobuf ${OpenCV_LIBS} ${FFMPEG_LIBRARIES})
 add_dependencies(xllm brpc-static)
 
 if(USE_NPU)

@@ -15,6 +15,7 @@ cc_library(
     sample_service_impl.h
     embedding_service_impl.h
     image_generation_service_impl.h
+    video_generation_service_impl.h
     rerank_service_impl.h
     qwen3_rerank_service_impl.h
     non_stream_call.h
@@ -38,6 +39,7 @@ cc_library(
     sample_service_impl.cpp
     embedding_service_impl.cpp
     image_generation_service_impl.cpp
+    video_generation_service_impl.cpp
     models_service_impl.cpp
     rerank_service_impl.cpp
     stream_output_parser.cpp

@@ -41,6 +41,7 @@ limitations under the License.
 #include "image_generation.pb.h"
 #include "models.pb.h"
 #include "service_impl_factory.h"
+#include "video_generation.pb.h"
 #include "xllm_metrics.h"
 namespace xllm {
 
@@ -486,6 +487,52 @@ void APIService::ImageGenerationHttp(
   image_generation_service_impl_->process_async(call);
 }
 
+void APIService::VideoGeneration(::google::protobuf::RpcController* controller,
+                                 const proto::VideoGenerationRequest* request,
+                                 proto::VideoGenerationResponse* response,
+                                 ::google::protobuf::Closure* done) {
+  // TODO with xllm-service
+}
+
+void APIService::VideoGenerationHttp(
+    ::google::protobuf::RpcController* controller,
+    const proto::HttpRequest* request,
+    proto::HttpResponse* response,
+    ::google::protobuf::Closure* done) {
+  xllm::ClosureGuard done_guard(
+      done,
+      std::bind(request_in_metric, nullptr),
+      std::bind(request_out_metric, (void*)controller));
+  if (!request || !response || !controller) {
+    LOG(ERROR) << "brpc request | respose | controller is null";
+    return;
+  }
+
+  auto arena = GetArenaWithCheck<VideoGenerationCall>(response);
+  auto req_pb =
+      google::protobuf::Arena::CreateMessage<proto::VideoGenerationRequest>(
+          arena);
+  auto resp_pb =
+      google::protobuf::Arena::CreateMessage<proto::VideoGenerationResponse>(
+          arena);
+
+  auto ctrl = reinterpret_cast<brpc::Controller*>(controller);
+  std::string error;
+  json2pb::Json2PbOptions options;
+  butil::IOBuf& buf = ctrl->request_attachment();
+  butil::IOBufAsZeroCopyInputStream iobuf_stream(buf);
+  auto st = json2pb::JsonToProtoMessage(&iobuf_stream, req_pb, options, &error);
+  if (!st) {
+    ctrl->SetFailed(error);
+    LOG(ERROR) << "parse json to proto failed: " << error;
+    return;
+  }
+  std::shared_ptr<VideoGenerationCall> call =
+      std::make_shared<VideoGenerationCall>(
+          ctrl, done_guard.release(), req_pb, resp_pb, arena != nullptr);
+  video_generation_service_impl_->process_async(call);
+}
+
 void APIService::Rerank(::google::protobuf::RpcController* controller,
                         const proto::RerankRequest* request,
                         proto::RerankResponse* response,

@@ -30,6 +30,7 @@ limitations under the License.
 #include "rec_completion_service_impl.h"
 #include "rerank_service_impl.h"
 #include "sample_service_impl.h"
+#include "video_generation_service_impl.h"
 #include "xllm_service.pb.h"
 
 namespace xllm {
@@ -96,6 +97,16 @@ class APIService : public proto::XllmAPIService {
                            proto::HttpResponse* response,
                            ::google::protobuf::Closure* done) override;
 
+  void VideoGeneration(::google::protobuf::RpcController* controller,
+                       const proto::VideoGenerationRequest* request,
+                       proto::VideoGenerationResponse* response,
+                       ::google::protobuf::Closure* done) override;
+
+  void VideoGenerationHttp(::google::protobuf::RpcController* controller,
+                           const proto::HttpRequest* request,
+                           proto::HttpResponse* response,
+                           ::google::protobuf::Closure* done) override;
+
   void Rerank(::google::protobuf::RpcController* controller,
               const proto::RerankRequest* request,
               proto::RerankResponse* response,
@@ -204,6 +215,7 @@ class APIService : public proto::XllmAPIService {
   std::unique_ptr<MMEmbeddingServiceImpl> mm_embedding_service_impl_;
   std::unique_ptr<ModelsServiceImpl> models_service_impl_;
   std::unique_ptr<ImageGenerationServiceImpl> image_generation_service_impl_;
+  std::unique_ptr<VideoGenerationServiceImpl> video_generation_service_impl_;
   std::unique_ptr<RerankServiceImpl> rerank_service_impl_;
   std::unique_ptr<RecCompletionServiceImpl> rec_completion_service_impl_;
 };

@@ -49,7 +49,7 @@ bool send_result_to_client_brpc(std::shared_ptr<ImageGenerationCall> call,
     auto* proto_result = proto_output->add_results();
 
     image.clear();
-    butil::Base64Encode(output.image, &image);
+    butil::Base64Encode(output.data, &image);
 
     proto_result->set_image(image);
     proto_result->set_width(output.width);

@@ -90,6 +90,9 @@ void ServiceImplFactory::create(
          self->image_generation_service_impl_ =
              std::make_unique<ImageGenerationServiceImpl>(
                  dynamic_cast<DiTMaster*>(master), models);
+         self->video_generation_service_impl_ =
+             std::make_unique<VideoGenerationServiceImpl>(
+                 dynamic_cast<DiTMaster*>(master), models);
        }},
       {static_cast<int8_t>(ServingMode::REC),
        [](APIService* self,

@@ -0,0 +1,103 @@
+/* Copyright 2026 The xLLM Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://github.com/jd-opensource/xllm/blob/main/LICENSE
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "video_generation_service_impl.h"
+
+#include <butil/base64.h>
+
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "api_service/utils.h"
+#include "core/framework/request/dit_request_params.h"
+#include "distributed_runtime/dit_master.h"
+
+namespace xllm {
+
+namespace {
+
+bool send_result_to_client_brpc(std::shared_ptr<VideoGenerationCall> call,
+                                const std::string& request_id,
+                                int64_t created_time,
+                                const std::string& model,
+                                const DiTRequestOutput& req_output) {
+  auto& response = call->response();
+  response.set_object("list");
+  response.set_id(request_id);
+  response.set_created(created_time);
+  response.set_model(model);
+  auto* proto_output = response.mutable_output();
+  const std::vector<DiTGenerationOutput>& outputs = req_output.outputs;
+  proto_output->mutable_results()->Reserve(outputs.size());
+
+  std::string video;
+  for (const auto& output : outputs) {
+    auto* proto_result = proto_output->add_results();
+
+    video.clear();
+    butil::Base64Encode(output.data, &video);
+    proto_result->set_video(video);
+
+    proto_result->set_width(output.width);
+    proto_result->set_height(output.height);
+    proto_result->set_seed(output.seed);
+    proto_result->set_num_frames(output.num_frames);
+    proto_result->set_fps(output.video_fps);
+  }
+  return call->write_and_finish(response);
+}
+
+}  // namespace
+
+VideoGenerationServiceImpl::VideoGenerationServiceImpl(
+    DiTMaster* master,
+    const std::vector<std::string>& models)
+    : APIServiceImpl(models), master_{master} {
+  CHECK(master_ != nullptr);
+}
+
+void VideoGenerationServiceImpl::process_async_impl(
+    std::shared_ptr<VideoGenerationCall> call) {
+  const auto& rpc_request = call->request();
+  const auto& model = rpc_request.model();
+  if (!models_.contains(model)) {
+    call->finish_with_error(StatusCode::UNKNOWN, "Model not supported");
+    return;
+  }
+
+  DiTRequestParams request_params(
+      rpc_request, call->get_x_request_id(), call->get_x_request_time());
+
+  std::string saved_request_id = request_params.request_id;
+  master_->handle_request(
+      std::move(request_params),
+      call.get(),
+      [call,
+       model,
+       request_id = std::move(saved_request_id),
+       created_time = absl::ToUnixSeconds(absl::Now())](
+          const DiTRequestOutput& req_output) -> bool {
+        if (req_output.status.has_value()) {
+          const auto& status = req_output.status.value();
+          if (!status.ok()) {
+            return call->finish_with_error(status.code(), status.message());
+          }
+        }
+
+        return send_result_to_client_brpc(
+            call, request_id, created_time, model, req_output);
+      });
+}
+
+}  // namespace xllm
@@ -0,0 +1,42 @@
+/* Copyright 2026 The xLLM Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://github.com/jd-opensource/xllm/blob/main/LICENSE
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#pragma once
+#include <absl/container/flat_hash_set.h>
+
+#include "api_service/api_service_impl.h"
+#include "api_service/non_stream_call.h"
+#include "video_generation.pb.h"
+
+namespace xllm {
+
+using VideoGenerationCall = NonStreamCall<proto::VideoGenerationRequest,
+                                          proto::VideoGenerationResponse>;
+class DiTMaster;
+// Handles /v1/video/generation requests
+class VideoGenerationServiceImpl final
+    : public APIServiceImpl<VideoGenerationCall> {
+ public:
+  VideoGenerationServiceImpl(DiTMaster* master,
+                             const std::vector<std::string>& models);
+
+  void process_async_impl(std::shared_ptr<VideoGenerationCall> call);
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(VideoGenerationServiceImpl);
+  DiTMaster* master_ = nullptr;
+};
+
+}  // namespace xllm
@@ -66,6 +66,8 @@ DiTForwardInput DiTBatch::prepare_forward_input() {
   std::vector<torch::Tensor> control_images;
   std::vector<torch::Tensor> latents;
   std::vector<torch::Tensor> masked_image_latents;
+  std::vector<torch::Tensor> last_images;
+  std::vector<torch::Tensor> image_embeds;
   const auto batch_size = request_vec_.size();
   prompt_embeds.reserve(batch_size);
   pooled_prompt_embeds.reserve(batch_size);
@@ -76,6 +78,8 @@ DiTForwardInput DiTBatch::prepare_forward_input() {
   control_images.reserve(batch_size);
   latents.reserve(batch_size);
   masked_image_latents.reserve(batch_size);
+  last_images.reserve(batch_size);
+  image_embeds.reserve(batch_size);
   for (const auto& request : request_vec_) {
     const auto& generation_params = request->state().generation_params();
     if (input.generation_params != generation_params) {
@@ -109,6 +113,8 @@ DiTForwardInput DiTBatch::prepare_forward_input() {
     mask_images.emplace_back(input_params.mask_image);
     condition_images.emplace_back(input_params.condition_image);
     control_images.emplace_back(input_params.control_image);
+    last_images.emplace_back(input_params.last_image);
+    image_embeds.emplace_back(input_params.image_embeds);
   }
 
   if (input.prompts.size() != request_vec_.size()) {
@@ -167,6 +173,14 @@ DiTForwardInput DiTBatch::prepare_forward_input() {
   if (check_tensors_valid(masked_image_latents)) {
     input.masked_image_latents = torch::stack(masked_image_latents);
   }
+
+  if (check_tensors_valid(last_images)) {
+    input.last_images = torch::stack(last_images);
+  }
+
+  if (check_tensors_valid(image_embeds)) {
+    input.image_embeds = torch::stack(image_embeds);
+  }
   return input;
 }
 

@@ -74,4 +74,5 @@ cc_library(
     proto::xllm_proto
     torch
     ${OpenCV_LIBS}
+    ${FFMPEG_LIBRARIES}
 )
@@ -52,7 +52,8 @@ void DiTRequest::log_statistic(double total_latency) {
 }
 
 void DiTRequest::handle_forward_output(torch::Tensor output) {
-  int count = state_.generation_params().num_images_per_prompt;
+  int count = state_.generation_params().num_images_per_prompt *
+              state_.generation_params().num_videos_per_prompt;
   output_.tensors = torch::chunk(output, count);
 }
 
@@ -69,12 +70,22 @@ const DiTRequestOutput DiTRequest::generate_output() {
   result.width = state_.generation_params().width;
   result.seed = state_.generation_params().seed;
 
-  OpenCVImageEncoder encoder;
-  int count = state_.generation_params().num_images_per_prompt;
+  OpenCVImageEncoder img_encoder;
+  FFmpegVideoEncoder vid_encoder;
+  int count = state_.generation_params().num_images_per_prompt *
+              state_.generation_params().num_videos_per_prompt;
   for (size_t idx = 0; idx < count; ++idx) {
-    torch::Tensor image =
+    torch::Tensor tensor =
         output_.tensors[idx].squeeze(0).cpu().to(torch::kFloat32).contiguous();
-    encoder.encode(image, result.image);
+    if (tensor.dim() == 4 || state_.generation_params().force_video_output) {
+      vid_encoder.encode(
+          tensor, state_.generation_params().video_fps, "mp4", result.data);
+      result.num_frames =
+          tensor.dim() == 4 ? static_cast<int32_t>(tensor.size(0)) : 0;
+      result.video_fps = state_.generation_params().video_fps;
+    } else {
+      img_encoder.encode(tensor, result.data);
+    }
     output.outputs.push_back(result);
   }