Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,5 @@ models/**

# Python local env
venv/
.venv/
.venv/
**/trace-*
3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ encase = { git = "https://github.com/cwfitzgerald/encase", branch = "add-member"
env_logger = "0.11.3"
fern = "0.6.2"
getrandom = "0.2"
glam = "0.27.0"
glam = "0.28.0"
globwalk = "0.8.1"
gloo-net = { version = "0.5.0", default-features = false }
hound = "3.5.1"
Expand Down Expand Up @@ -93,3 +93,4 @@ wasm-bindgen-futures = "0.4.42"
web-sys = "0.3.69"
web-time = "1.0.0"
futures-intrusive = "0.5.0"
include_dir = "0.7.4"
19 changes: 12 additions & 7 deletions crates/ratchet-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@ version = "0.1.0"
edition = "2021"

[features]
default = ["rand", "testing"]
default = ["rand", "testing", "trace"]
gpu-profiling = ["dep:tabled", "dep:itertools"]
rand = ["dep:rand", "dep:rand_distr"]
plotting = ["dep:dot3", "dep:tempfile"]
testing = ["dep:npyz", "dep:ndarray"]
pyo3 = ["dep:pyo3", "dep:numpy", "dep:regex"]
debug = [] #dump every node
trace = ["dep:uuid", "dep:web-time"]

[build-dependencies]
tera = { workspace = true }
Expand All @@ -34,19 +34,18 @@ log = { workspace = true }
thiserror = { workspace = true }
serde = { workspace = true, features = ["derive"] }
anyhow.workspace = true

smallvec = { workspace = true , features = ["serde"] }
rustc-hash = { workspace = true }
slotmap = { workspace = true }
parking_lot = { workspace = true }
smallvec = { workspace = true }
encase = { workspace = true, features = ["smallvec", "glam"] }
pollster = { workspace = true }
getrandom = { workspace = true, features = ["js"] } # Needed for wasm support in `num` trait
num = { workspace = true }
rand_distr = { workspace = true, optional = true }
rand = { workspace = true, optional = true }
glam = { workspace = true }
npyz = { workspace = true, optional = true }
npyz = { workspace = true, optional = true, features=["half"] }
ndarray = { workspace = true, optional = true }

#Plotting
Expand All @@ -61,13 +60,19 @@ pyo3 = { workspace = true, features = ["auto-initialize"], optional = true }
regex = { workspace = true, optional = true }
numpy = { workspace = true, optional = true, features=["half"]}

# Trace
uuid = { version="1.1.0", features = ["v4", "fast-rng"], optional = true }
web-time = { workspace = true, optional = true }
serde_json.workspace = true


[target.'cfg(target_arch = "wasm32")'.dependencies]
wasm-bindgen.workspace = true
futures-intrusive.workspace = true
wasm-bindgen-futures.workspace = true

async-trait = "0.1.77"
smallvec = { workspace = true , features = ["serde"] }
include_dir.workspace = true
smallvec = { workspace = true }

[dev-dependencies]
env_logger = { workspace = true }
Expand Down
6 changes: 4 additions & 2 deletions crates/ratchet-core/src/compiled_op.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ use crate::gpu::{
};
use crate::{drvec, rvec, KernelKey, OperationError, RVec, Tensor};
use derive_new::new;
#[cfg(feature = "trace")]
use std::sync::Arc;
use wgpu::DynamicOffset;

//Compiled op represents a single kernel invocation
Expand All @@ -16,8 +18,8 @@ pub struct CompiledOp {
storage_groups: RVec<GpuBindGroup>,
offset: DynamicOffset, //offset into the metadata uniform buffer
pub kernel_key: KernelKey,
#[cfg(feature = "debug")]
pub debug_buffer: Option<Arc<wgpu::Buffer>>,
#[cfg(feature = "trace")]
pub trace_buffer: Option<Arc<wgpu::Buffer>>,
}

impl CompiledOp {
Expand Down
7 changes: 7 additions & 0 deletions crates/ratchet-core/src/device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,13 @@ impl Device {
format!("{:?}", self)
}

pub fn device_identifier(&self) -> String {
match self {
Device::CPU => "CPU".to_string(),
Device::GPU(gpu) => gpu.info().device_identifier(),
}
}

pub fn try_gpu(&self) -> Result<&WgpuDevice, DeviceError> {
match self {
Device::GPU(gpu) => Ok(gpu),
Expand Down
55 changes: 18 additions & 37 deletions crates/ratchet-core/src/executable.rs
Original file line number Diff line number Diff line change
@@ -1,21 +1,25 @@
use crate::gpu::{GpuUniform, PoolError, StaticResourcePoolAccessor, WgpuDevice};
use crate::CompiledOp;
use derive_new::new;
#[cfg(not(feature = "debug"))]
use std::marker::PhantomData;
use wgpu::SubmissionIndex;

#[cfg(not(feature = "trace"))]
use std::marker::PhantomData;

#[cfg(feature = "trace")]
use crate::Tensor;

/// # Executable
///
/// A linear sequence of compiled operations, with a single uniform buffer
/// containing metadata for all operations.
#[derive(new)]
pub struct Executable<'t> {
steps: Vec<CompiledOp>,
gpu_uniform: GpuUniform,
#[cfg(feature = "debug")]
debug_list: Vec<&'t Tensor>,
#[cfg(not(feature = "debug"))]
pub(crate) steps: Vec<CompiledOp>,
pub(crate) gpu_uniform: GpuUniform,
#[cfg(feature = "trace")]
pub(crate) trace_list: Vec<&'t Tensor>,
#[cfg(not(feature = "trace"))]
_phantom: PhantomData<&'t ()>,
}

Expand Down Expand Up @@ -58,15 +62,13 @@ impl Executable<'_> {
Ok(device.queue().submit(Some(encoder.finish())))
}

#[cfg(feature = "debug")]
pub(crate) fn dispatch_debugging(
#[cfg(feature = "trace")]
pub(crate) fn dispatch_trace(
&self,
device: &WgpuDevice,
) -> Result<SubmissionIndex, ExecutionError> {
use crate::{wgpu_buffer_to_cpu_buffer, DeviceStorage};

let pipeline_resources = device.pipeline_resources();
assert!(self.debug_list.len() == self.steps.len());
assert!(self.trace_list.len() == self.steps.len());

let mut last_index = None;
for (step_index, step) in self.steps.iter().enumerate() {
Expand All @@ -92,7 +94,7 @@ impl Executable<'_> {
cpass.dispatch_workgroups(x_count, y_count, z_count);
}

let result_t = self.debug_list[step_index].clone();
let result_t = self.trace_list[step_index].clone();
let gpu_storage = result_t.storage();
let result_buf = &gpu_storage
.as_ref()
Expand All @@ -101,39 +103,18 @@ impl Executable<'_> {
.map_err(|_| ExecutionError::DebuggingError("Failed to get result buf."))?
.inner;

let debug_buffer = step
.debug_buffer
let trace_buffer = step
.trace_buffer
.as_ref()
.ok_or(ExecutionError::DebuggingError(
"Failed to get debug buffer.",
))?;
encoder.copy_buffer_to_buffer(result_buf, 0, debug_buffer, 0, debug_buffer.size());
encoder.copy_buffer_to_buffer(result_buf, 0, trace_buffer, 0, trace_buffer.size());

let index = device.queue().submit(Some(encoder.finish()));
last_index = Some(index);
}

//Dump all of our debug results
for (si, step) in self.steps.iter().enumerate() {
let d = device.clone();
let dt = self.debug_list[si].dt();
let debug_buffer = step.debug_buffer.clone().unwrap();
let alignment = dt.size_of();
let kernel_key = step.kernel_key.clone();
#[cfg(target_arch = "wasm32")]
{
wasm_bindgen_futures::spawn_local(async move {
let cpu_buf = wgpu_buffer_to_cpu_buffer(&debug_buffer, alignment, d).await;
log::debug!("{}: {}\n {:?}\n", si, kernel_key, cpu_buf.dump(dt, false));
});
}
#[cfg(not(target_arch = "wasm32"))]
{
let cpu_buf = wgpu_buffer_to_cpu_buffer(&debug_buffer, alignment, &d);
log::debug!("{}: {}\n {:?}\n", si, kernel_key, cpu_buf.dump(dt, false));
}
}

Ok(last_index.unwrap())
}

Expand Down
39 changes: 39 additions & 0 deletions crates/ratchet-core/src/gpu/device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ pub struct WgpuDevice {
pipeline_layout_pool: Arc<PipelineLayoutPool>,
compute_pipeline_pool: Arc<ComputePipelinePool>,
kernel_module_pool: Arc<KernelModulePool>,
device_info: DeviceInfo,
device_limits: DeviceLimits,
device_features: DeviceFeatures,
device: Arc<wgpu::Device>,
Expand Down Expand Up @@ -76,6 +77,7 @@ impl WgpuDevice {
max_compute_invocations_per_workgroup: 1024,
..Default::default()
},
memory_hints: wgpu::MemoryHints::default(),
};
let device_request = adapter.request_device(&device_descriptor, None).await;
let (device, queue) = if let Err(e) = device_request {
Expand All @@ -93,6 +95,7 @@ impl WgpuDevice {
log::warn!("Forcing F32 precision");
features.SHADER_F16 = false;
}
features.SHADER_F16 = false;

if std::env::var("RATCHET_DISABLE_SUBGROUPS").is_ok() {
log::warn!("Disabling subgroup support");
Expand All @@ -111,6 +114,7 @@ impl WgpuDevice {
kernel_module_pool: Arc::new(KernelModulePool::new()),
compute_pipeline_pool: Arc::new(ComputePipelinePool::new()),
device: Arc::new(device),
device_info: adapter.get_info().into(),
device_limits: limits,
device_features: features,
})
Expand Down Expand Up @@ -166,6 +170,10 @@ impl WgpuDevice {
/// Returns a shared reference to the limits stored in `device_limits`.
pub fn limits(&self) -> &DeviceLimits {
&self.device_limits
}

/// Returns a shared reference to the adapter description stored in
/// `device_info` (filled from `adapter.get_info()` when the device is built).
pub fn info(&self) -> &DeviceInfo {
&self.device_info
}
}

impl WgpuDevice {
Expand Down Expand Up @@ -283,6 +291,37 @@ impl WgpuDevice {
}
}

/// Owned description of the adapter a device was created from.
///
/// Every field is a direct copy of the same-named field of
/// `wgpu::AdapterInfo` (see the `From<wgpu::AdapterInfo>` impl below).
/// `Debug` is derived so the value can be logged or embedded in other
/// `Debug` output.
#[derive(Clone, Debug)]
pub struct DeviceInfo {
    /// Adapter name as reported by wgpu.
    pub name: String,
    /// Vendor id reported by wgpu.
    pub vendor: u32,
    /// Device id reported by wgpu.
    pub device: u32,
    /// Kind of device, as classified by wgpu.
    pub device_type: wgpu::DeviceType,
    /// Driver name reported by wgpu.
    pub driver: String,
    /// Additional driver information reported by wgpu.
    pub driver_info: String,
    /// Graphics backend the adapter runs on.
    pub backend: wgpu::Backend,
}

impl DeviceInfo {
    /// Builds an identifier of the form `<name>-<backend>`.
    ///
    /// Every space in the adapter name is replaced by a hyphen; the backend
    /// part is the string returned by `wgpu::Backend::to_str`.
    pub fn device_identifier(&self) -> String {
        let hyphenated_name = self.name.replace(' ', "-");
        let backend_label = self.backend.to_str();
        format!("{}-{}", hyphenated_name, backend_label)
    }
}

impl From<wgpu::AdapterInfo> for DeviceInfo {
    /// Moves each field of the adapter description into the same-named
    /// `DeviceInfo` field without modifying any value.
    fn from(info: wgpu::AdapterInfo) -> Self {
        // Take the adapter info apart once, then rebuild with shorthand.
        let wgpu::AdapterInfo {
            name,
            vendor,
            device,
            device_type,
            driver,
            driver_info,
            backend,
            ..
        } = info;

        Self {
            name,
            vendor,
            device,
            device_type,
            driver,
            driver_info,
            backend,
        }
    }
}

#[derive(Clone)]
pub struct DeviceLimits {
pub max_bind_groups: u32,
Expand Down
2 changes: 2 additions & 0 deletions crates/ratchet-core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ mod storage;
mod strides;
mod tensor;
mod tensor_id;
mod trace;

pub use compiled_op::*;
pub use device::*;
Expand All @@ -31,6 +32,7 @@ pub use storage::*;
pub use strides::*;
pub use tensor::*;
pub use tensor_id::*;
pub use trace::*;

#[cfg(feature = "plotting")]
pub use plot::render_to_file;
Expand Down
26 changes: 12 additions & 14 deletions crates/ratchet-core/src/op.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ use crate::{
use std::borrow::Cow;
use std::fmt::Debug;

#[cfg(feature = "trace")]
use {crate::gpu::BufferUsagesExt, std::sync::Arc};

#[derive(Clone, Debug)]
#[non_exhaustive]
pub enum LazyOp {
Expand Down Expand Up @@ -283,7 +286,6 @@ pub trait GPUOperation: Operation {
uniform: &mut CpuUniform,
device: &WgpuDevice,
can_inplace: bool,
debug: bool,
) -> Result<CompiledOp, OperationError> {
let kernel = self.select_kernel();

Expand Down Expand Up @@ -338,26 +340,22 @@ pub trait GPUOperation: Operation {
can_inplace,
)?;

#[cfg(feature = "debug")]
let debug_buffer = if debug {
Some(Arc::new(device.create_buffer(&wgpu::BufferDescriptor {
label: Some("debug buffer"),
size: dst.num_bytes() as _,
usage: wgpu::BufferUsages::standard(),
mapped_at_creation: false,
})))
} else {
None
};
#[cfg(feature = "trace")]
let trace_buffer = Some(Arc::new(device.create_buffer(&wgpu::BufferDescriptor {
label: Some("debug buffer"),
size: dst.num_bytes() as _,
usage: wgpu::BufferUsages::standard(),
mapped_at_creation: false,
})));

Ok(CompiledOp::new(
pipeline_handle,
workload.workgroup_count,
storage_bind_groups,
offset as _,
kernel_src_desc.key,
#[cfg(feature = "debug")]
debug_buffer,
#[cfg(feature = "trace")]
trace_buffer,
))
}
}
Loading