11 changes: 7 additions & 4 deletions dlc_developer_config.toml
@@ -1,6 +1,6 @@
[dev]
# Set to "huggingface", for example, if you are a huggingface developer. Default is ""
partner_developer = ""
partner_developer = "huggingface"
# Please only set it to true if you are preparing an EI related PR
# Do remember to revert it back to false before merging any PR (including EI dedicated PR)
ei_mode = false
@@ -36,12 +36,12 @@ deep_canary_mode = false

[build]
# Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
# available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_vllm", "huggingface_sglang", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
build_frameworks = []
# available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_vllm", "huggingface_sglang", "huggingface_llamacpp", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
build_frameworks = ["huggingface_llamacpp"]


# By default we build both training and inference containers. Set true/false values to determine which to build.
-build_training = true
+build_training = false
build_inference = true

# Set do_build to "false" to skip builds and test the latest image built by this PR
@@ -192,5 +192,8 @@ dlc-pr-huggingface-vllm = ""
# HuggingFace SGLang
dlc-pr-huggingface-sglang = ""

+# Huggingface Llamacpp
+dlc-pr-huggingface-llamacpp = "/huggingface/llamacpp/buildspec.yml"
+
# sglang
dlc-pr-sglang = ""
133 changes: 133 additions & 0 deletions huggingface/llamacpp/build_artifacts/llamacpp_sagemaker_server.patch
@@ -0,0 +1,133 @@
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -11,7 +11,9 @@
#include "llama.h"
#include "log.h"

+#include <algorithm>
#include <atomic>
+#include <cctype>
#include <clocale>
#include <exception>
#include <signal.h>
@@ -69,6 +71,81 @@
}
return res;
};
+}
+
+static std::string sagemaker_header(const server_http_req & req, const std::string & name) {
+ for (const auto & h : req.headers) {
+ std::string key = h.first;
+ std::transform(key.begin(), key.end(), key.begin(), [](unsigned char c) { return std::tolower(c); });
+ if (key == name) {
+ return h.second;
+ }
+ }
+ return "";
+}
+
+static std::string sagemaker_route_from_attrs(const server_http_req & req) {
+ const std::string attrs = sagemaker_header(req, "x-amzn-sagemaker-custom-attributes");
+ const std::string key = "route=";
+ const size_t pos = attrs.find(key);
+ if (pos == std::string::npos) {
+ return "";
+ }
+ const size_t start = pos + key.size();
+ const size_t end = attrs.find_first_of(",; \t\r\n", start);
+ return attrs.substr(start, end == std::string::npos ? std::string::npos : end - start);
+}
+
+static bool sagemaker_route_syntax_ok(const std::string & route) {
+ return !route.empty() && route[0] == '/' && route.find("..") == std::string::npos &&
+ route.find("://") == std::string::npos && route.find('?') == std::string::npos &&
+ route.find('#') == std::string::npos;
+}
+
+static std::string sagemaker_default_route(const server_http_req & req) {
+ const json body = json::parse(req.body, nullptr, false);
+ if (body.is_object()) {
+ if (body.contains("messages")) {
+ return "/v1/chat/completions";
+ }
+ if (body.contains("prompt")) {
+ return "/v1/completions";
+ }
+ if (body.contains("input")) {
+ return "/v1/embeddings";
+ }
+ }
+ return "/v1/chat/completions";
+}
+
+static server_http_res_ptr sagemaker_error(int status, const std::string & message) {
+ auto res = std::make_unique<server_http_res>();
+ res->status = status;
+ res->data = safe_json_to_str({
+ { "error", {
+ { "code", status },
+ { "message", message },
+ { "type", "invalid_request_error" },
+ } },
+ });
+ return res;
+}
+
+static server_http_res_ptr sagemaker_invocations(
+ const server_http_req & req,
+ const std::map<std::string, server_http_context::handler_t> & routes) {
+ const std::string requested = sagemaker_route_from_attrs(req);
+ const std::string route = requested.empty() ? sagemaker_default_route(req) : requested;
+ if (!sagemaker_route_syntax_ok(route)) {
+ return sagemaker_error(400, "invalid SageMaker route: " + route);
+ }
+ const auto it = routes.find(route);
+ if (it == routes.end()) {
+ return sagemaker_error(400, "unsupported SageMaker route: " + route);
+ }
+ server_http_req routed_req = req;
+ routed_req.path = route;
+ return it->second(routed_req);
}

int main(int argc, char ** argv) {
@@ -169,6 +246,38 @@
ctx_http.post("/models/unload", ex_wrapper(models_routes->post_router_models_unload));
}

+
+ const std::map<std::string, server_http_context::handler_t> sagemaker_routes = {
+ {"/props", routes.post_props},
+ {"/completion", routes.post_completions},
+ {"/completions", routes.post_completions},
+ {"/v1/completions", routes.post_completions_oai},
+ {"/chat/completions", routes.post_chat_completions},
+ {"/v1/chat/completions", routes.post_chat_completions},
+ {"/v1/responses", routes.post_responses_oai},
+ {"/responses", routes.post_responses_oai},
+ {"/v1/audio/transcriptions", routes.post_transcriptions_oai},
+ {"/audio/transcriptions", routes.post_transcriptions_oai},
+ {"/v1/messages", routes.post_anthropic_messages},
+ {"/v1/messages/count_tokens", routes.post_anthropic_count_tokens},
+ {"/infill", routes.post_infill},
+ {"/embedding", routes.post_embeddings},
+ {"/embeddings", routes.post_embeddings},
+ {"/v1/embeddings", routes.post_embeddings_oai},
+ {"/rerank", routes.post_rerank},
+ {"/reranking", routes.post_rerank},
+ {"/v1/rerank", routes.post_rerank},
+ {"/v1/reranking", routes.post_rerank},
+ {"/tokenize", routes.post_tokenize},
+ {"/detokenize", routes.post_detokenize},
+ {"/apply-template", routes.post_apply_template},
+ {"/lora-adapters", routes.post_lora_adapters},
+ };
+
+ ctx_http.get ("/ping", ex_wrapper(routes.get_health)); // SageMaker health endpoint
+ ctx_http.post("/invocations", ex_wrapper([&sagemaker_routes](const server_http_req & req) {
+ return sagemaker_invocations(req, sagemaker_routes);
+ }));
ctx_http.get ("/health", ex_wrapper(routes.get_health)); // public endpoint (no API key check)
ctx_http.get ("/v1/health", ex_wrapper(routes.get_health)); // public endpoint (no API key check)
ctx_http.get ("/metrics", ex_wrapper(routes.get_metrics));
53 changes: 53 additions & 0 deletions huggingface/llamacpp/build_artifacts/sagemaker_entrypoint.sh
@@ -0,0 +1,53 @@
#!/bin/bash
set -euo pipefail

# Execute the telemetry script if it exists, suppressing any errors
bash /usr/local/bin/bash_telemetry.sh >/dev/null 2>&1 || true

# Source CUDA compat for older drivers (e.g., g5 instances)
if [ -f /usr/local/bin/start_cuda_compat.sh ] \
&& command -v nvidia-smi >/dev/null 2>&1 \
&& command -v nvcc >/dev/null 2>&1; then
source /usr/local/bin/start_cuda_compat.sh
fi

# SageMaker sends traffic to port 8080 on /ping and /invocations. The custom
# llama-server build handles those routes directly.
HOST="${LLAMACPP_SAGEMAKER_HOST:-0.0.0.0}"
PORT="${SAGEMAKER_BIND_TO_PORT:-${LLAMACPP_SAGEMAKER_PORT:-8080}}"

PREFIX="SM_LLAMACPP_"
ARG_PREFIX="--"

ARGS=()

while IFS='=' read -r key value; do
arg_name=$(echo "${key#"${PREFIX}"}" | tr '[:upper:]' '[:lower:]' | tr '_' '-')

ARGS+=("${ARG_PREFIX}${arg_name}")
if [ -n "$value" ]; then
ARGS+=("$value")
fi
done < <(env | grep "^${PREFIX}" || true)

# Drop any user-supplied --host / --port so SageMaker can always reach the server.
normalized=()
skip_next=0
for a in "${ARGS[@]}"; do
if [ "$skip_next" -eq 1 ]; then
skip_next=0
continue
fi
if [ "$a" = "--host" ] || [ "$a" = "--port" ]; then
skip_next=1
continue
fi
normalized+=("$a")
done
ARGS=("${normalized[@]}")
ARGS+=(--host "$HOST" --port "$PORT")

echo "[sagemaker] llama-server args: ${ARGS[*]}" >&2

exec /app/llama-server "${ARGS[@]}"
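A sketch of how the SM_LLAMACPP_ mapping plays out when starting the container (the image URI, model path, and the specific llama-server flags below are illustrative assumptions, not defined by this script):

docker run --rm -p 8080:8080 \
  -e SM_LLAMACPP_MODEL=/opt/ml/model/model.gguf \
  -e SM_LLAMACPP_CTX_SIZE=8192 \
  <image-uri>
# The loop above would expand these to roughly:
#   /app/llama-server --model /opt/ml/model/model.gguf --ctx-size 8192 --host 0.0.0.0 --port 8080
# Any user-supplied --host/--port (e.g. from SM_LLAMACPP_HOST or SM_LLAMACPP_PORT) are dropped and
# replaced with the HOST/PORT values resolved above (0.0.0.0:8080 by default).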
25 changes: 25 additions & 0 deletions huggingface/llamacpp/build_artifacts/start_cuda_compat.sh
@@ -0,0 +1,25 @@
#!/bin/bash

verlte() {
[ "$1" = "$2" ] && return 1 || [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ]
}

COMPAT_FILE=/usr/local/cuda/compat/libcuda.so.1
if [ -f $COMPAT_FILE ]; then
CUDA_COMPAT_MAX_DRIVER_VERSION=$(readlink $COMPAT_FILE | cut -d'.' -f 3-)
echo "CUDA compat package should be installed for NVIDIA driver smaller than ${CUDA_COMPAT_MAX_DRIVER_VERSION}"
NVIDIA_DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true)
if [ -z "$NVIDIA_DRIVER_VERSION" ]; then
NVIDIA_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader --id=0 2>/dev/null || true)
fi
echo "Current installed NVIDIA driver version is ${NVIDIA_DRIVER_VERSION}"
if verlte $NVIDIA_DRIVER_VERSION $CUDA_COMPAT_MAX_DRIVER_VERSION; then
echo "Adding CUDA compat to LD_LIBRARY_PATH"
export LD_LIBRARY_PATH=/usr/local/cuda/compat:$LD_LIBRARY_PATH
echo $LD_LIBRARY_PATH
else
echo "Skipping CUDA compat setup as newer NVIDIA driver is installed"
fi
else
echo "Skipping CUDA compat setup as package not found"
fi
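A quick illustration of the comparison helper, assuming the verlte function above has been sourced (the driver versions are made up for the example):

verlte "535.183.01" "580.82.07" && echo "older driver: compat dir gets prepended to LD_LIBRARY_PATH"
verlte "580.82.07" "580.82.07" || echo "identical versions return non-zero, so the compat dir is skipped"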
78 changes: 78 additions & 0 deletions huggingface/llamacpp/buildspec.yml
@@ -0,0 +1,78 @@
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
prod_account_id: &PROD_ACCOUNT_ID 763104351884
region: &REGION <set-$REGION-in-environment>
base_framework: &BASE_FRAMEWORK llamacpp
framework: &FRAMEWORK !join [ "huggingface_", *BASE_FRAMEWORK]
version: &VERSION "b8882"
short_version: &SHORT_VERSION "b8882"
arch_type: &ARCH_TYPE x86_64
autopatch_build: "False"

repository_info:
build_repository: &BUILD_REPOSITORY
image_type: &IMAGE_TYPE inference
root: huggingface/llamacpp
repository_name: &REPOSITORY_NAME !join [ "pr", "-", "huggingface", "-", *BASE_FRAMEWORK ]
repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
release_repository_name: &RELEASE_REPOSITORY_NAME !join [ "huggingface", "-", *BASE_FRAMEWORK ]
release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ]

context:
build_context: &BUILD_CONTEXT
deep_learning_container:
source: ../../src/deep_learning_container.py
target: deep_learning_container.py
start_cuda_compat:
source: build_artifacts/start_cuda_compat.sh
target: start_cuda_compat.sh
sagemaker_entrypoint:
source: build_artifacts/sagemaker_entrypoint.sh
target: sagemaker_entrypoint.sh
llamacpp_sagemaker_server_patch:
source: build_artifacts/llamacpp_sagemaker_server.patch
target: llamacpp_sagemaker_server.patch


images:
BuildHuggingFaceLlamacppGpuCu130DockerImage:
<<: *BUILD_REPOSITORY
context:
<<: *BUILD_CONTEXT
image_size_baseline: 40000
device_type: &DEVICE_TYPE gpu
cuda_version: &CUDA_VERSION cu130
os_version: &OS_VERSION ubuntu24.04
python_version: &DOCKER_PYTHON_VERSION py3
tag_python_version: &TAG_PYTHON_VERSION py312
transformers_version: &TRANSFORMERS_VERSION 4.57.3
tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *CUDA_VERSION, '-', *OS_VERSION ]
docker_file: !join [ docker/, *SHORT_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
target: sagemaker
build: true
enable_common_stage_build: false
test_configs:
test_platforms:
- sanity
- security
- sagemaker

BuildHuggingFaceLlamacppCpuDockerImage:
<<: *BUILD_REPOSITORY
context:
<<: *BUILD_CONTEXT
image_size_baseline: 40000
device_type: &DEVICE_TYPE cpu
os_version: &OS_VERSION ubuntu24.04
python_version: &DOCKER_PYTHON_VERSION py3
tag_python_version: &TAG_PYTHON_VERSION py312
transformers_version: &TRANSFORMERS_VERSION 4.57.3
tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *OS_VERSION ]
docker_file: !join [ docker/, *SHORT_VERSION, /Dockerfile., *DEVICE_TYPE ]
target: sagemaker
build: true
enable_common_stage_build: false
test_configs:
test_platforms:
- sanity
- security
- sagemaker
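For reference, the !join anchors above should resolve to the tag b8882-transformers4.57.3-gpu-cu130-ubuntu24.04 and dockerfile path docker/b8882/cu130/Dockerfile.gpu for the GPU image, and to b8882-transformers4.57.3-cpu-ubuntu24.04 with docker/b8882/Dockerfile.cpu for the CPU image; both PR builds push to the pr-huggingface-llamacpp repository.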