diff --git a/candle-core/src/cpu_backend.rs b/candle-core/src/cpu_backend.rs
index c3ec8249..01ccfde7 100644
--- a/candle-core/src/cpu_backend.rs
+++ b/candle-core/src/cpu_backend.rs
@@ -2,6 +2,7 @@ use crate::backend::{BackendDevice, BackendStorage};
 use crate::op::{BinaryOpT, CmpOp, ReduceOp, UnaryOpT};
 use crate::{DType, Error, IntDType, Layout, Result, Shape, WithDType};
 use half::{bf16, f16};
+use rayon::prelude::*;
 
 // TODO: Maybe we should not implement [Clone] here and instead have an explicit allocator +
 // intercept the oom errors to avoid panicking and provide a proper error.
@@ -1052,10 +1053,8 @@ impl<'a> Map2 for Conv1D<'a> {
             }
         }
 
-        let num_threads = crate::utils::get_num_threads();
-
         for offset in 0..p.k_size {
-            crate::cpu::kernels::par_range(0, p.c_out, num_threads, |dst_c_idx| {
+            (0..p.c_out).into_par_iter().for_each(|dst_c_idx| {
                 let dst_idx = dst_c_idx * l_out;
                 let k_cont = (0..p.c_in)
                     .map(|c_in_idx| k[dst_c_idx * k_s0 + c_in_idx * k_s1 + offset * k_s2])
@@ -1123,11 +1122,9 @@ impl<'a> Map2 for Conv2D<'a> {
             }
         }
 
-        let num_threads = crate::utils::get_num_threads();
-
         for offset_h in 0..p.k_h {
             for offset_w in 0..p.k_w {
-                crate::cpu::kernels::par_range(0, p.c_out, num_threads, |dst_c_idx| {
+                (0..p.c_out).into_par_iter().for_each(|dst_c_idx| {
                     let dst_idx = dst_c_idx * out_w * out_h;
                     let k_cont = (0..p.c_in)
                         .map(|c_in_idx| {
@@ -1216,11 +1213,10 @@ impl<'a> Map2 for ConvTranspose2D<'a> {
                 }
             }
         }
-        let num_threads = crate::utils::get_num_threads();
 
         for k_y in 0..p.k_h {
             for k_x in 0..p.k_w {
-                crate::cpu::kernels::par_range(0, p.c_out, num_threads, |dst_c_idx| {
+                (0..p.c_out).into_par_iter().for_each(|dst_c_idx| {
                     let k_cont = (0..p.c_in)
                         .map(|c_in_idx| {
                             k[c_in_idx * k_s0 + dst_c_idx * k_s1 + k_y * k_s2 + k_x * k_s3]