diff --git a/README.md b/README.md
index b524772f..1ccb8c4f 100644
--- a/README.md
+++ b/README.md
@@ -214,6 +214,12 @@ Begin by setting up a Python 3.X environment with a recent, CUDA-enabled version
 pip install git+https://github.com/NVlabs/tiny-cuda-nn/#subdirectory=bindings/torch
 ```
 
+On headless, shared, or CPU-only Python environments, PyTorch may be unable to detect a target GPU during installation. In that case, set `TCNN_CUDA_ARCHITECTURES` explicitly before invoking `pip`, for example:
+```sh
+export TCNN_CUDA_ARCHITECTURES=86
+pip install git+https://github.com/NVlabs/tiny-cuda-nn/#subdirectory=bindings/torch
+```
+
 Alternatively, if you would like to install from a local clone of __tiny-cuda-nn__, invoke
 ```sh
 tiny-cuda-nn$ cd bindings/torch
diff --git a/bindings/torch/setup.py b/bindings/torch/setup.py
index 041936c4..9613d5af 100644
--- a/bindings/torch/setup.py
+++ b/bindings/torch/setup.py
@@ -2,17 +2,56 @@
 
 import re
 from setuptools import setup
-from pkg_resources import parse_version
+from packaging.version import parse as parse_version
 import subprocess
 import shutil
 import sys
 import torch
 from glob import glob
+
+if "CUDA_HOME" not in os.environ and "CUDA_PATH" in os.environ:
+	os.environ["CUDA_HOME"] = os.environ["CUDA_PATH"]
+
 from torch.utils.cpp_extension import BuildExtension, CUDAExtension
 
 SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
 ROOT_DIR = os.path.dirname(os.path.dirname(SCRIPT_DIR))
 
+HELP_FLAGS = {
+	"-h",
+	"--help",
+	"--help-commands",
+	"--name",
+	"--version",
+	"--fullname",
+	"--author",
+	"--author-email",
+	"--maintainer",
+	"--maintainer-email",
+	"--contact",
+	"--contact-email",
+	"--url",
+	"--license",
+	"--description",
+	"--long-description",
+	"--platforms",
+	"--classifiers",
+	"--keywords",
+	"--provides",
+	"--requires",
+	"--obsoletes",
+}
+BUILD_COMMANDS = {
+	"bdist",
+	"bdist_egg",
+	"bdist_wheel",
+	"build",
+	"build_ext",
+	"develop",
+	"editable_wheel",
+	"install",
+}
+
 def min_supported_compute_capability(cuda_version):
 	if cuda_version >= parse_version("13.0"):
 		return 75
@@ -33,6 +72,31 @@ def max_supported_compute_capability(cuda_version):
 	else:
 		return 120
 
+def should_build_extensions(argv):
+	if any(arg in HELP_FLAGS for arg in argv[1:]):
+		return False
+
+	commands = [arg for arg in argv[1:] if arg and not arg.startswith("-")]
+	return any(command in BUILD_COMMANDS or command.startswith("bdist_") for command in commands)
+
+def detect_compute_capabilities():
+	if "TCNN_CUDA_ARCHITECTURES" in os.environ and os.environ["TCNN_CUDA_ARCHITECTURES"]:
+		compute_capabilities = [int(x) for x in os.environ["TCNN_CUDA_ARCHITECTURES"].replace(";", ",").split(",")]
+		print(f"Obtained compute capabilities {compute_capabilities} from environment variable TCNN_CUDA_ARCHITECTURES")
+		return compute_capabilities
+
+	if torch.cuda.is_available():
+		major, minor = torch.cuda.get_device_capability()
+		compute_capabilities = [major * 10 + minor]
+		print(f"Obtained compute capability {compute_capabilities[0]} from PyTorch")
+		return compute_capabilities
+
+	raise EnvironmentError(
+		"Unknown compute capability for extension build. "
+		"Specify TCNN_CUDA_ARCHITECTURES manually (for example, 86 for RTX 30xx GPUs) "
+		"or install PyTorch with CUDA support so the target GPU can be detected automatically."
+	)
+
 # Find version of tinycudann by scraping CMakeLists.txt
 with open(os.path.join(ROOT_DIR, "CMakeLists.txt"), "r") as cmakelists:
 	for line in cmakelists.readlines():
@@ -41,18 +105,14 @@ def max_supported_compute_capability(cuda_version):
 			break
 
 print(f"Building PyTorch extension for tiny-cuda-nn version {VERSION}")
+build_extensions = should_build_extensions(sys.argv)
 
 ext_modules = []
-
-if "TCNN_CUDA_ARCHITECTURES" in os.environ and os.environ["TCNN_CUDA_ARCHITECTURES"]:
-	compute_capabilities = [int(x) for x in os.environ["TCNN_CUDA_ARCHITECTURES"].replace(";", ",").split(",")]
-	print(f"Obtained compute capabilities {compute_capabilities} from environment variable TCNN_CUDA_ARCHITECTURES")
-elif torch.cuda.is_available():
-	major, minor = torch.cuda.get_device_capability()
-	compute_capabilities = [major * 10 + minor]
-	print(f"Obtained compute capability {compute_capabilities[0]} from PyTorch")
+if build_extensions:
+	compute_capabilities = detect_compute_capabilities()
 else:
-	raise EnvironmentError("Unknown compute capability. Specify the target compute capabilities in the TCNN_CUDA_ARCHITECTURES environment variable or install PyTorch with the CUDA backend to detect it automatically.")
+	print("Skipping GPU architecture detection for metadata-only setup command.")
+	compute_capabilities = []
 
 include_networks = True
 if "--no-networks" in sys.argv:
@@ -60,7 +120,7 @@ def max_supported_compute_capability(cuda_version):
 	sys.argv.remove("--no-networks")
 	print("Building >> without << neural networks (just the input encodings)")
 
-if os.name == "nt":
+if build_extensions and os.name == "nt":
 	def find_cl_path():
 		import glob
 		for executable in ["Program Files (x86)", "Program Files"]:
@@ -84,7 +144,7 @@ def find_cl_path():
 cpp_standard = 14
 
 # Get CUDA version and make sure the targeted compute capability is compatible
-if os.system("nvcc --version") == 0:
+if build_extensions and os.system("nvcc --version") == 0:
 	nvcc_out = subprocess.check_output(["nvcc", "--version"]).decode()
 	cuda_version = re.search(r"release (\S+),", nvcc_out)
 
@@ -105,31 +165,6 @@ def find_cl_path():
 			print(f"WARNING: Compute capabilities {compute_capabilities} are not all supported by the installed CUDA version {cuda_version}. Targeting {supported_compute_capabilities} instead.")
 			compute_capabilities = supported_compute_capabilities
 
-min_compute_capability = min(compute_capabilities)
-
-print(f"Targeting C++ standard {cpp_standard}")
-
-base_nvcc_flags = [
-	f"-std=c++{cpp_standard}",
-	"--extended-lambda",
-	"--use_fast_math",
-	"--expt-relaxed-constexpr",
-	# The following definitions must be undefined
-	# since TCNN requires half-precision operation.
-	"-U__CUDA_NO_HALF_OPERATORS__",
-	"-U__CUDA_NO_HALF_CONVERSIONS__",
-	"-U__CUDA_NO_HALF2_OPERATORS__",
-]
-
-if os.name == "posix":
-	base_cflags = [f"-std=c++{cpp_standard}"]
-	base_nvcc_flags += [
-		"-Xcompiler=-Wno-float-conversion",
-		"-Xcompiler=-fno-strict-aliasing",
-	]
-elif os.name == "nt":
-	base_cflags = [f"/std:c++{cpp_standard}"]
-
 
 # Some containers set this to contain old architectures that won't compile. We only need the one installed in the machine.
 os.environ["TORCH_CUDA_ARCH_LIST"] = ""
@@ -138,45 +173,6 @@ def find_cl_path():
 bindings_dir = os.path.dirname(__file__)
 root_dir = os.path.abspath(os.path.join(bindings_dir, "../.."))
 
-base_definitions = [
-	# PyTorch-supplied parameters may be unaligned. TCNN must be made aware of this such that
-	# it does not optimize for aligned memory accesses.
-	"-DTCNN_PARAMS_UNALIGNED",
-	"-DTCNN_RTC",
-	"-DTCNN_RTC_USE_FAST_MATH",
-]
-
-if "TCNN_HALF_PRECISION" in os.environ:
-    enable_half = os.environ["TCNN_HALF_PRECISION"].lower() in ["1", "true", "on", "yes"]
-    base_definitions.append(f"-DTCNN_HALF_PRECISION={int(enable_half)}")
-    print(f"Forcing TCNN_HALF_PRECISION to {'ON' if enable_half else 'OFF'}")
-else:
-    if min_compute_capability == 61 or min_compute_capability <= 52:
-        enable_half = False
-    else:
-        enable_half = True
-    print(f"Auto-detecting TCNN_HALF_PRECISION: {'ON' if enable_half else 'OFF'} (Arch: {min_compute_capability})")
-base_definitions.append(f"-DTCNN_HALF_PRECISION={int(enable_half)}")
-
-base_source_files = [
-	"tinycudann/bindings.cpp",
-	"../../dependencies/fmt/src/format.cc",
-	"../../dependencies/fmt/src/os.cc",
-	"../../src/cpp_api.cu",
-	"../../src/common_host.cu",
-	"../../src/encoding.cu",
-	"../../src/object.cu",
-	"../../src/rtc_kernel.cu",
-]
-
-if include_networks:
-	base_source_files += [
-		"../../src/network.cu",
-		"../../src/cutlass_mlp.cu",
-	]
-else:
-	base_definitions.append("-DTCNN_NO_NETWORKS")
-
 # Copy headers required by RTC at runtime
 rtc_dir = os.path.join(bindings_dir, "tinycudann", "rtc")
 rtc_include_dir = os.path.join(rtc_dir, "include")
@@ -186,9 +182,9 @@ def find_cl_path():
 os.makedirs(rtc_cache_dir, exist_ok=True)
 
 nvcc_path = shutil.which("nvcc")
-if nvcc_path is None:
+if build_extensions and nvcc_path is None:
 	print(f"WARNING: could not find CUDA include directory. JIT compilation will not be supported.")
-else:
+if nvcc_path is not None:
 	cuda_include_dir = os.path.join(os.path.dirname(os.path.dirname(nvcc_path)), "include")
 
 	cuda_headers = glob(f"{cuda_include_dir}/cuda_fp16*") + glob(f"{cuda_include_dir}/vector*")
@@ -207,34 +203,102 @@ def copy_files(whence, files):
 	copy_files(f"{root_dir}/include", tcnn_headers)
 	copy_files(f"{root_dir}/dependencies", pcg32_headers)
 
-def make_extension(compute_capability):
-	nvcc_flags = base_nvcc_flags + [f"-gencode=arch=compute_{compute_capability},code={code}_{compute_capability}" for code in ["compute", "sm"]]
-	definitions = base_definitions + [f"-DTCNN_MIN_GPU_ARCH={compute_capability}"]
+cmdclass = {}
+
+if build_extensions:
+	min_compute_capability = min(compute_capabilities)
+
+	print(f"Targeting C++ standard {cpp_standard}")
+
+	base_nvcc_flags = [
+		f"-std=c++{cpp_standard}",
+		"--extended-lambda",
+		"--use_fast_math",
+		"--expt-relaxed-constexpr",
+		# The following definitions must be undefined
+		# since TCNN requires half-precision operation.
+		"-U__CUDA_NO_HALF_OPERATORS__",
+		"-U__CUDA_NO_HALF_CONVERSIONS__",
+		"-U__CUDA_NO_HALF2_OPERATORS__",
+	]
+
+	if os.name == "posix":
+		base_cflags = [f"-std=c++{cpp_standard}"]
+		base_nvcc_flags += [
+			"-Xcompiler=-Wno-float-conversion",
+			"-Xcompiler=-fno-strict-aliasing",
+		]
+	elif os.name == "nt":
+		base_cflags = [f"/std:c++{cpp_standard}"]
+
+	base_definitions = [
+		# PyTorch-supplied parameters may be unaligned. TCNN must be made aware of this such that
+		# it does not optimize for aligned memory accesses.
+		"-DTCNN_PARAMS_UNALIGNED",
+		"-DTCNN_RTC",
+		"-DTCNN_RTC_USE_FAST_MATH",
+	]
 
-	if include_networks and compute_capability > 70:
-		source_files = base_source_files + ["../../src/fully_fused_mlp.cu"]
+	if "TCNN_HALF_PRECISION" in os.environ:
+		enable_half = os.environ["TCNN_HALF_PRECISION"].lower() in ["1", "true", "on", "yes"]
+		base_definitions.append(f"-DTCNN_HALF_PRECISION={int(enable_half)}")
+		print(f"Forcing TCNN_HALF_PRECISION to {'ON' if enable_half else 'OFF'}")
 	else:
-		source_files = base_source_files
-
-	nvcc_flags = nvcc_flags + definitions
-	cflags = base_cflags + definitions
-
-	ext = CUDAExtension(
-		name=f"tinycudann_bindings._{compute_capability}_C",
-		sources=source_files,
-		include_dirs=[
-			f"{root_dir}/include",
-			f"{root_dir}/dependencies",
-			f"{root_dir}/dependencies/cutlass/include",
-			f"{root_dir}/dependencies/cutlass/tools/util/include",
-			f"{root_dir}/dependencies/fmt/include",
-		],
-		extra_compile_args={"cxx": cflags, "nvcc": nvcc_flags},
-		libraries=["cuda", "nvrtc"],
-	)
-	return ext
+		if min_compute_capability == 61 or min_compute_capability <= 52:
+			enable_half = False
+		else:
+			enable_half = True
+		print(f"Auto-detecting TCNN_HALF_PRECISION: {'ON' if enable_half else 'OFF'} (Arch: {min_compute_capability})")
+	base_definitions.append(f"-DTCNN_HALF_PRECISION={int(enable_half)}")
+
+	base_source_files = [
+		"tinycudann/bindings.cpp",
+		"../../dependencies/fmt/src/format.cc",
+		"../../dependencies/fmt/src/os.cc",
+		"../../src/cpp_api.cu",
+		"../../src/common_host.cu",
+		"../../src/encoding.cu",
+		"../../src/object.cu",
+		"../../src/rtc_kernel.cu",
+	]
 
-ext_modules = [make_extension(comp) for comp in compute_capabilities]
+	if include_networks:
+		base_source_files += [
+			"../../src/network.cu",
+			"../../src/cutlass_mlp.cu",
+		]
+	else:
+		base_definitions.append("-DTCNN_NO_NETWORKS")
+
+	def make_extension(compute_capability):
+		nvcc_flags = base_nvcc_flags + [f"-gencode=arch=compute_{compute_capability},code={code}_{compute_capability}" for code in ["compute", "sm"]]
+		definitions = base_definitions + [f"-DTCNN_MIN_GPU_ARCH={compute_capability}"]
+
+		if include_networks and compute_capability > 70:
+			source_files = base_source_files + ["../../src/fully_fused_mlp.cu"]
+		else:
+			source_files = base_source_files
+
+		nvcc_flags = nvcc_flags + definitions
+		cflags = base_cflags + definitions
+
+		ext = CUDAExtension(
+			name=f"tinycudann_bindings._{compute_capability}_C",
+			sources=source_files,
+			include_dirs=[
+				f"{root_dir}/include",
+				f"{root_dir}/dependencies",
+				f"{root_dir}/dependencies/cutlass/include",
+				f"{root_dir}/dependencies/cutlass/tools/util/include",
+				f"{root_dir}/dependencies/fmt/include",
+			],
+			extra_compile_args={"cxx": cflags, "nvcc": nvcc_flags},
+			libraries=["cuda", "nvrtc"],
+		)
+		return ext
+
+	ext_modules = [make_extension(comp) for comp in compute_capabilities]
+	cmdclass = {"build_ext": BuildExtension}
 
 def package_files(directory):
 	paths = []
@@ -273,5 +337,5 @@ def package_files(directory):
 	include_package_data=True,
 	zip_safe=False,
 	ext_modules=ext_modules,
-	cmdclass={"build_ext": BuildExtension}
+	cmdclass=cmdclass
 )