Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
9d1e37a
Make the f64 codelet operate entirely in registers
Shnatsel Apr 11, 2026
eb26222
Make f32 codelet operate entirely in registers
Shnatsel Apr 11, 2026
3aedc25
Delete old, superseded codelets
Shnatsel Apr 11, 2026
2345b58
Properly vectorize f32 codelet
Shnatsel Apr 11, 2026
636e501
Reduce live set in f64 codelet to reduce register pressure
Shnatsel Apr 11, 2026
a2f7c2a
Another attempt at massaging f64 assembly; didn't work
Shnatsel Apr 11, 2026
1356ff2
Revert "Another attempt at massaging f64 assembly; didn't work"
Shnatsel Apr 11, 2026
bb293bc
Address the register spills in f32 codelet passes 0 and 1
Shnatsel Apr 11, 2026
ee3958c
Tighten up stage 2 assembly in f32 codelet
Shnatsel Apr 11, 2026
35951fd
Add a comment on register spilling
Shnatsel Apr 17, 2026
34a7b3d
Merge branch 'main' into in-registers-codelet
Shnatsel Apr 18, 2026
c69ca7c
Use codelets unconditionally
Shnatsel Apr 18, 2026
c065e53
Drop unused import
Shnatsel Apr 18, 2026
62d7eed
Remove codelet control from planner now that it is always beneficial
Shnatsel Apr 18, 2026
ff6d8d0
Mark mode as unused to suppress compiler warning (for now)
Shnatsel Apr 18, 2026
6e9d213
Replace zip_low()+zip_high() with interleave(), should have better pe…
Shnatsel Apr 18, 2026
5cfd39a
Reduce f64 codelet from 5 stages to 4, to reduce register pressure in…
Shnatsel Apr 18, 2026
af675da
Polyfill interleave() until the upstream fearless_simd PR is merged
Shnatsel Apr 18, 2026
d70d7eb
Merge branch 'main' into in-registers-codelet
Shnatsel Apr 18, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -61,4 +61,4 @@ debug = true
[package.metadata.docs.rs]
all-features = true

[lints.rust]
[lints.rust]
23 changes: 16 additions & 7 deletions src/algorithms/dit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
use fearless_simd::{dispatch, Simd};

use crate::algorithms::bravo::{bit_rev_bravo_f32, bit_rev_bravo_f64};
use crate::kernels::codelets::{fft_dit_codelet_32_f32, fft_dit_codelet_32_f64};
use crate::kernels::codelets::{fft_dit_codelet_16_f64, fft_dit_codelet_32_f32};
use crate::kernels::dit::*;
use crate::options::Options;
use crate::parallel::run_maybe_in_parallel;
Expand All @@ -42,10 +42,11 @@ fn recursive_dit_fft_f64<S: Simd>(
let log_size = size.ilog2() as usize;

if size <= L1_BLOCK_SIZE {
// Use FFT-32 codelet to fuse stages 0-4 into a single pass per 32-element chunk
let start_stage = if planner.use_codelet_32 {
fft_dit_codelet_32_f64(simd, &mut reals[..size], &mut imags[..size]);
5
// Use FFT-16 codelet to fuse stages 0-3 into a single pass per 16-element chunk
let codelet_stages = 4;
let start_stage = if stage_twiddle_idx == 0 && size >= power_of_two(codelet_stages) {
fft_dit_codelet_16_f64(simd, &mut reals[..size], &mut imags[..size]);
codelet_stages
} else {
0
};
Expand Down Expand Up @@ -109,9 +110,10 @@ fn recursive_dit_fft_f32<S: Simd>(

if size <= L1_BLOCK_SIZE {
// Use FFT-32 codelet to fuse stages 0-4 into a single pass per 32-element chunk
let start_stage = if planner.use_codelet_32 {
let codelet_stages = 5;
let start_stage = if stage_twiddle_idx == 0 && size >= power_of_two(codelet_stages) {
fft_dit_codelet_32_f32(simd, &mut reals[..size], &mut imags[..size]);
5
codelet_stages
} else {
0
};
Expand Down Expand Up @@ -392,3 +394,10 @@ fn fft_32_dit_with_planner_and_opts_impl<S: Simd>(
}
}
}

/// Returns two raised to `power`, computed as a left shift of one.
///
/// This helper exists because spelling the same thing with `2.pow()`
/// needs a lot of noisy type annotations at the call site.
///
/// In builds with debug assertions enabled, `power` is checked to be
/// smaller than the bit width of `usize`.
#[inline]
fn power_of_two(power: usize) -> usize {
    // A shift by the full bit width or more cannot produce a valid power of two.
    debug_assert!(power < usize::BITS as usize);
    1_usize << power
}
Loading
Loading