11 changes: 7 additions & 4 deletions dlc_developer_config.toml
@@ -1,6 +1,6 @@
[dev]
# Set to "huggingface", for example, if you are a huggingface developer. Default is ""
partner_developer = ""
partner_developer = "huggingface"
# Please only set it to true if you are preparing an EI related PR
# Do remember to revert it back to false before merging any PR (including EI dedicated PR)
ei_mode = false
@@ -36,12 +36,12 @@ deep_canary_mode = false

[build]
# Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
# available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_vllm", "huggingface_sglang", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
build_frameworks = []
# available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_vllm", "huggingface_sglang", "huggingface_llamacpp", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
build_frameworks = ["huggingface_llamacpp"]


# By default we build both training and inference containers. Set true/false values to determine which to build.
-build_training = true
+build_training = false
build_inference = true

# Set do_build to "false" to skip builds and test the latest image built by this PR
@@ -192,5 +192,8 @@ dlc-pr-huggingface-vllm = ""
# HuggingFace SGLang
dlc-pr-huggingface-sglang = ""

+# Huggingface Llamacpp
+dlc-pr-huggingface-llamacpp = "/huggingface/llamacpp/buildspec.yml"
+
# sglang
dlc-pr-sglang = ""
133 changes: 133 additions & 0 deletions huggingface/llamacpp/build_artifacts/llamacpp_sagemaker_server.patch
@@ -0,0 +1,133 @@
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -11,7 +11,9 @@
#include "llama.h"
#include "log.h"

+#include <algorithm>
#include <atomic>
+#include <cctype>
#include <clocale>
#include <exception>
#include <signal.h>
@@ -69,6 +71,81 @@
}
return res;
};
+}
+
+static std::string sagemaker_header(const server_http_req & req, const std::string & name) {
+ for (const auto & h : req.headers) {
+ std::string key = h.first;
+ std::transform(key.begin(), key.end(), key.begin(), [](unsigned char c) { return std::tolower(c); });
+ if (key == name) {
+ return h.second;
+ }
+ }
+ return "";
+}
+
+static std::string sagemaker_route_from_attrs(const server_http_req & req) {
+ const std::string attrs = sagemaker_header(req, "x-amzn-sagemaker-custom-attributes");
+ const std::string key = "route=";
+ const size_t pos = attrs.find(key);
+ if (pos == std::string::npos) {
+ return "";
+ }
+ const size_t start = pos + key.size();
+ const size_t end = attrs.find_first_of(",; \t\r\n", start);
+ return attrs.substr(start, end == std::string::npos ? std::string::npos : end - start);
+}
+
+static bool sagemaker_route_syntax_ok(const std::string & route) {
+ return !route.empty() && route[0] == '/' && route.find("..") == std::string::npos &&
+ route.find("://") == std::string::npos && route.find('?') == std::string::npos &&
+ route.find('#') == std::string::npos;
+}
+
+static std::string sagemaker_default_route(const server_http_req & req) {
+ const json body = json::parse(req.body, nullptr, false);
+ if (body.is_object()) {
+ if (body.contains("messages")) {
+ return "/v1/chat/completions";
+ }
+ if (body.contains("prompt")) {
+ return "/v1/completions";
+ }
+ if (body.contains("input")) {
+ return "/v1/embeddings";
+ }
+ }
+ return "/v1/chat/completions";
+}
+
+static server_http_res_ptr sagemaker_error(int status, const std::string & message) {
+ auto res = std::make_unique<server_http_res>();
+ res->status = status;
+ res->data = safe_json_to_str({
+ { "error", {
+ { "code", status },
+ { "message", message },
+ { "type", "invalid_request_error" },
+ } },
+ });
+ return res;
+}
+
+static server_http_res_ptr sagemaker_invocations(
+ const server_http_req & req,
+ const std::map<std::string, server_http_context::handler_t> & routes) {
+ const std::string requested = sagemaker_route_from_attrs(req);
+ const std::string route = requested.empty() ? sagemaker_default_route(req) : requested;
+ if (!sagemaker_route_syntax_ok(route)) {
+ return sagemaker_error(400, "invalid SageMaker route: " + route);
+ }
+ const auto it = routes.find(route);
+ if (it == routes.end()) {
+ return sagemaker_error(400, "unsupported SageMaker route: " + route);
+ }
+ server_http_req routed_req = req;
+ routed_req.path = route;
+ return it->second(routed_req);
}

int main(int argc, char ** argv) {
@@ -169,6 +246,38 @@
ctx_http.post("/models/unload", ex_wrapper(models_routes->post_router_models_unload));
}

+
+ const std::map<std::string, server_http_context::handler_t> sagemaker_routes = {
+ {"/props", routes.post_props},
+ {"/completion", routes.post_completions},
+ {"/completions", routes.post_completions},
+ {"/v1/completions", routes.post_completions_oai},
+ {"/chat/completions", routes.post_chat_completions},
+ {"/v1/chat/completions", routes.post_chat_completions},
+ {"/v1/responses", routes.post_responses_oai},
+ {"/responses", routes.post_responses_oai},
+ {"/v1/audio/transcriptions", routes.post_transcriptions_oai},
+ {"/audio/transcriptions", routes.post_transcriptions_oai},
+ {"/v1/messages", routes.post_anthropic_messages},
+ {"/v1/messages/count_tokens", routes.post_anthropic_count_tokens},
+ {"/infill", routes.post_infill},
+ {"/embedding", routes.post_embeddings},
+ {"/embeddings", routes.post_embeddings},
+ {"/v1/embeddings", routes.post_embeddings_oai},
+ {"/rerank", routes.post_rerank},
+ {"/reranking", routes.post_rerank},
+ {"/v1/rerank", routes.post_rerank},
+ {"/v1/reranking", routes.post_rerank},
+ {"/tokenize", routes.post_tokenize},
+ {"/detokenize", routes.post_detokenize},
+ {"/apply-template", routes.post_apply_template},
+ {"/lora-adapters", routes.post_lora_adapters},
+ };
+
+ ctx_http.get ("/ping", ex_wrapper(routes.get_health)); // SageMaker health endpoint
+ ctx_http.post("/invocations", ex_wrapper([&sagemaker_routes](const server_http_req & req) {
+ return sagemaker_invocations(req, sagemaker_routes);
+ }));
ctx_http.get ("/health", ex_wrapper(routes.get_health)); // public endpoint (no API key check)
ctx_http.get ("/v1/health", ex_wrapper(routes.get_health)); // public endpoint (no API key check)
ctx_http.get ("/metrics", ex_wrapper(routes.get_metrics));
53 changes: 53 additions & 0 deletions huggingface/llamacpp/build_artifacts/sagemaker_entrypoint.sh
@@ -0,0 +1,53 @@
#!/bin/bash
set -euo pipefail

# Execute the telemetry script if it exists, suppressing any errors
bash /usr/local/bin/bash_telemetry.sh >/dev/null 2>&1 || true

# Source CUDA compat for older drivers (e.g., g5 instances)
if [ -f /usr/local/bin/start_cuda_compat.sh ] \
&& command -v nvidia-smi >/dev/null 2>&1 \
&& command -v nvcc >/dev/null 2>&1; then
source /usr/local/bin/start_cuda_compat.sh
fi

# SageMaker sends traffic to port 8080 on /ping and /invocations. The custom
# llama-server build handles those routes directly.
HOST="${LLAMACPP_SAGEMAKER_HOST:-0.0.0.0}"
PORT="${SAGEMAKER_BIND_TO_PORT:-${LLAMACPP_SAGEMAKER_PORT:-8080}}"

PREFIX="SM_LLAMACPP_"
ARG_PREFIX="--"

ARGS=()

while IFS='=' read -r key value; do
arg_name=$(echo "${key#"${PREFIX}"}" | tr '[:upper:]' '[:lower:]' | tr '_' '-')

ARGS+=("${ARG_PREFIX}${arg_name}")
if [ -n "$value" ]; then
ARGS+=("$value")
fi
done < <(env | grep "^${PREFIX}" || true)

# Drop any user-supplied --host / --port so SageMaker can always reach the server.
normalized=()
skip_next=0
for a in "${ARGS[@]}"; do
if [ "$skip_next" -eq 1 ]; then
skip_next=0
continue
fi
if [ "$a" = "--host" ] || [ "$a" = "--port" ]; then
skip_next=1
continue
fi
normalized+=("$a")
done
ARGS=("${normalized[@]}")
ARGS+=(--host "$HOST" --port "$PORT")

echo "[sagemaker] llama-server args: ${ARGS[*]}" >&2

exec /app/llama-server "${ARGS[@]}"
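A sketch of how the SM_LLAMACPP_ mapping plays out when starting the container (the image URI, model path, and the specific llama-server flags below are illustrative assumptions, not defined by this script):

docker run --rm -p 8080:8080 \
  -e SM_LLAMACPP_MODEL=/opt/ml/model/model.gguf \
  -e SM_LLAMACPP_CTX_SIZE=8192 \
  <image-uri>
# The loop above would expand these to roughly:
#   /app/llama-server --model /opt/ml/model/model.gguf --ctx-size 8192 --host 0.0.0.0 --port 8080
# Any user-supplied --host/--port (e.g. from SM_LLAMACPP_HOST or SM_LLAMACPP_PORT) are dropped and
# replaced with the HOST/PORT values resolved above (0.0.0.0:8080 by default).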
25 changes: 25 additions & 0 deletions huggingface/llamacpp/build_artifacts/start_cuda_compat.sh
@@ -0,0 +1,25 @@
#!/bin/bash

verlte() {
[ "$1" = "$2" ] && return 1 || [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ]
}

COMPAT_FILE=/usr/local/cuda/compat/libcuda.so.1
if [ -f $COMPAT_FILE ]; then
CUDA_COMPAT_MAX_DRIVER_VERSION=$(readlink $COMPAT_FILE | cut -d'.' -f 3-)
echo "CUDA compat package should be installed for NVIDIA driver smaller than ${CUDA_COMPAT_MAX_DRIVER_VERSION}"
NVIDIA_DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true)
if [ -z "$NVIDIA_DRIVER_VERSION" ]; then
NVIDIA_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader --id=0 2>/dev/null || true)
fi
echo "Current installed NVIDIA driver version is ${NVIDIA_DRIVER_VERSION}"
if verlte $NVIDIA_DRIVER_VERSION $CUDA_COMPAT_MAX_DRIVER_VERSION; then
echo "Adding CUDA compat to LD_LIBRARY_PATH"
export LD_LIBRARY_PATH=/usr/local/cuda/compat:$LD_LIBRARY_PATH
echo $LD_LIBRARY_PATH
else
echo "Skipping CUDA compat setup as newer NVIDIA driver is installed"
fi
else
echo "Skipping CUDA compat setup as package not found"
fi
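A quick illustration of the comparison helper, assuming the verlte function above has been sourced (the driver versions are made up for the example):

verlte "535.183.01" "580.82.07" && echo "older driver: compat dir gets prepended to LD_LIBRARY_PATH"
verlte "580.82.07" "580.82.07" || echo "identical versions return non-zero, so the compat dir is skipped"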
78 changes: 78 additions & 0 deletions huggingface/llamacpp/buildspec.yml
@@ -0,0 +1,78 @@
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
prod_account_id: &PROD_ACCOUNT_ID 763104351884
region: &REGION <set-$REGION-in-environment>
base_framework: &BASE_FRAMEWORK llamacpp
framework: &FRAMEWORK !join [ "huggingface_", *BASE_FRAMEWORK]
version: &VERSION "b8882"
short_version: &SHORT_VERSION "b8882"
arch_type: &ARCH_TYPE x86_64
autopatch_build: "False"

repository_info:
build_repository: &BUILD_REPOSITORY
image_type: &IMAGE_TYPE inference
root: huggingface/llamacpp
repository_name: &REPOSITORY_NAME !join [ "pr", "-", "huggingface", "-", *BASE_FRAMEWORK ]
repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
release_repository_name: &RELEASE_REPOSITORY_NAME !join [ "huggingface", "-", *BASE_FRAMEWORK ]
release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ]

context:
build_context: &BUILD_CONTEXT
deep_learning_container:
source: ../../src/deep_learning_container.py
target: deep_learning_container.py
start_cuda_compat:
source: build_artifacts/start_cuda_compat.sh
target: start_cuda_compat.sh
sagemaker_entrypoint:
source: build_artifacts/sagemaker_entrypoint.sh
target: sagemaker_entrypoint.sh
llamacpp_sagemaker_server_patch:
source: build_artifacts/llamacpp_sagemaker_server.patch
target: llamacpp_sagemaker_server.patch


images:
BuildHuggingFaceLlamacppGpuCu130DockerImage:
<<: *BUILD_REPOSITORY
context:
<<: *BUILD_CONTEXT
image_size_baseline: 40000
device_type: &DEVICE_TYPE gpu
cuda_version: &CUDA_VERSION cu130
os_version: &OS_VERSION ubuntu24.04
python_version: &DOCKER_PYTHON_VERSION py3
tag_python_version: &TAG_PYTHON_VERSION py312
transformers_version: &TRANSFORMERS_VERSION 4.57.3
tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *CUDA_VERSION, '-', *OS_VERSION ]
docker_file: !join [ docker/, *SHORT_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
target: sagemaker
build: true
enable_common_stage_build: false
test_configs:
test_platforms:
- sanity
- security
- sagemaker

BuildHuggingFaceLlamacppCpuDockerImage:
<<: *BUILD_REPOSITORY
context:
<<: *BUILD_CONTEXT
image_size_baseline: 40000
device_type: &DEVICE_TYPE cpu
os_version: &OS_VERSION ubuntu24.04
python_version: &DOCKER_PYTHON_VERSION py3
tag_python_version: &TAG_PYTHON_VERSION py312
transformers_version: &TRANSFORMERS_VERSION 4.57.3
tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *OS_VERSION ]
docker_file: !join [ docker/, *SHORT_VERSION, /Dockerfile., *DEVICE_TYPE ]
target: sagemaker
build: true
enable_common_stage_build: false
test_configs:
test_platforms:
- sanity
- security
- sagemaker
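For reference, the !join anchors above should resolve to the tag b8882-transformers4.57.3-gpu-cu130-ubuntu24.04 and dockerfile path docker/b8882/cu130/Dockerfile.gpu for the GPU image, and to b8882-transformers4.57.3-cpu-ubuntu24.04 with docker/b8882/Dockerfile.cpu for the CPU image; both PR builds push to the pr-huggingface-llamacpp repository.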