diff --git a/Cargo.lock b/Cargo.lock
index 2b8f209..3332f06 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2974,10 +2974,56 @@ dependencies = [
  "sha2",
 ]
 
+[[package]]
+name = "p3-challenger"
+version = "0.4.1"
+source = "git+https://github.com/Plonky3/Plonky3?branch=main#584ab6fea44a2855ace0b7aa19b66f70a258a09b"
+dependencies = [
+ "p3-field",
+ "p3-maybe-rayon",
+ "p3-monty-31",
+ "p3-symmetric",
+ "p3-util",
+ "tracing",
+]
+
+[[package]]
+name = "p3-circle"
+version = "0.4.1"
+source = "git+https://github.com/Plonky3/Plonky3?branch=main#584ab6fea44a2855ace0b7aa19b66f70a258a09b"
+dependencies = [
+ "itertools 0.14.0",
+ "p3-challenger",
+ "p3-commit",
+ "p3-dft",
+ "p3-field",
+ "p3-fri",
+ "p3-matrix",
+ "p3-maybe-rayon",
+ "p3-util",
+ "serde",
+ "thiserror 2.0.17",
+ "tracing",
+]
+
+[[package]]
+name = "p3-commit"
+version = "0.4.1"
+source = "git+https://github.com/Plonky3/Plonky3?branch=main#584ab6fea44a2855ace0b7aa19b66f70a258a09b"
+dependencies = [
+ "itertools 0.14.0",
+ "p3-challenger",
+ "p3-dft",
+ "p3-field",
+ "p3-matrix",
+ "p3-util",
+ "serde",
+]
+
 [[package]]
 name = "p3-dft"
-version = "0.3.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=main#01e5b79a6f3593986c4bcdeb6bb6a9bb76424375"
+version = "0.4.1"
+source = "git+https://github.com/Plonky3/Plonky3?branch=main#584ab6fea44a2855ace0b7aa19b66f70a258a09b"
 dependencies = [
  "itertools 0.14.0",
  "p3-field",
@@ -2990,8 +3036,8 @@ dependencies = [
 
 [[package]]
 name = "p3-field"
-version = "0.3.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=main#01e5b79a6f3593986c4bcdeb6bb6a9bb76424375"
+version = "0.4.1"
+source = "git+https://github.com/Plonky3/Plonky3?branch=main#584ab6fea44a2855ace0b7aa19b66f70a258a09b"
 dependencies = [
  "itertools 0.14.0",
  "num-bigint",
@@ -3003,10 +3049,41 @@ dependencies = [
  "tracing",
 ]
 
+[[package]]
+name = "p3-fri"
+version = "0.4.1"
+source = "git+https://github.com/Plonky3/Plonky3?branch=main#584ab6fea44a2855ace0b7aa19b66f70a258a09b"
+dependencies = [
+ "itertools 0.14.0",
+ "p3-challenger",
+ "p3-commit",
+ "p3-dft",
+ "p3-field",
+ "p3-interpolation",
+ "p3-matrix",
+ "p3-maybe-rayon",
+ "p3-util",
+ "rand 0.9.2",
+ "serde",
+ "thiserror 2.0.17",
+ "tracing",
+]
+
+[[package]]
+name = "p3-interpolation"
+version = "0.4.1"
+source = "git+https://github.com/Plonky3/Plonky3?branch=main#584ab6fea44a2855ace0b7aa19b66f70a258a09b"
+dependencies = [
+ "p3-field",
+ "p3-matrix",
+ "p3-maybe-rayon",
+ "p3-util",
+]
+
 [[package]]
 name = "p3-matrix"
-version = "0.3.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=main#01e5b79a6f3593986c4bcdeb6bb6a9bb76424375"
+version = "0.4.1"
+source = "git+https://github.com/Plonky3/Plonky3?branch=main#584ab6fea44a2855ace0b7aa19b66f70a258a09b"
 dependencies = [
  "itertools 0.14.0",
  "p3-field",
@@ -3020,13 +3097,13 @@ dependencies = [
 
 [[package]]
 name = "p3-maybe-rayon"
-version = "0.3.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=main#01e5b79a6f3593986c4bcdeb6bb6a9bb76424375"
+version = "0.4.1"
+source = "git+https://github.com/Plonky3/Plonky3?branch=main#584ab6fea44a2855ace0b7aa19b66f70a258a09b"
 
 [[package]]
 name = "p3-mds"
-version = "0.3.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=main#01e5b79a6f3593986c4bcdeb6bb6a9bb76424375"
+version = "0.4.1"
+source = "git+https://github.com/Plonky3/Plonky3?branch=main#584ab6fea44a2855ace0b7aa19b66f70a258a09b"
 dependencies = [
  "p3-dft",
  "p3-field",
@@ -3035,13 +3112,32 @@ dependencies = [
  "rand 0.9.2",
 ]
 
+[[package]]
+name = "p3-merkle-tree"
+version = "0.4.1"
+source = "git+https://github.com/Plonky3/Plonky3?branch=main#584ab6fea44a2855ace0b7aa19b66f70a258a09b"
+dependencies = [
+ "itertools 0.14.0",
+ "p3-commit",
+ "p3-field",
+ "p3-matrix",
+ "p3-maybe-rayon",
+ "p3-symmetric",
+ "p3-util",
+ "rand 0.9.2",
+ "serde",
+ "thiserror 2.0.17",
+ "tracing",
+]
+
 [[package]]
 name = "p3-mersenne-31"
-version = "0.3.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=main#01e5b79a6f3593986c4bcdeb6bb6a9bb76424375"
+version = "0.4.1"
+source = "git+https://github.com/Plonky3/Plonky3?branch=main#584ab6fea44a2855ace0b7aa19b66f70a258a09b"
 dependencies = [
  "itertools 0.14.0",
  "num-bigint",
+ "p3-challenger",
  "p3-dft",
  "p3-field",
  "p3-matrix",
@@ -3054,10 +3150,33 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "p3-monty-31"
+version = "0.4.1"
+source = "git+https://github.com/Plonky3/Plonky3?branch=main#584ab6fea44a2855ace0b7aa19b66f70a258a09b"
+dependencies = [
+ "itertools 0.14.0",
+ "num-bigint",
+ "p3-dft",
+ "p3-field",
+ "p3-matrix",
+ "p3-maybe-rayon",
+ "p3-mds",
+ "p3-poseidon2",
+ "p3-symmetric",
+ "p3-util",
+ "paste",
+ "rand 0.9.2",
+ "serde",
+ "spin 0.10.0",
+ "tracing",
+ "transpose",
+]
+
 [[package]]
 name = "p3-poseidon2"
-version = "0.3.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=main#01e5b79a6f3593986c4bcdeb6bb6a9bb76424375"
+version = "0.4.1"
+source = "git+https://github.com/Plonky3/Plonky3?branch=main#584ab6fea44a2855ace0b7aa19b66f70a258a09b"
 dependencies = [
  "p3-field",
  "p3-mds",
@@ -3068,8 +3187,8 @@ dependencies = [
 
 [[package]]
 name = "p3-symmetric"
-version = "0.3.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=main#01e5b79a6f3593986c4bcdeb6bb6a9bb76424375"
+version = "0.4.1"
+source = "git+https://github.com/Plonky3/Plonky3?branch=main#584ab6fea44a2855ace0b7aa19b66f70a258a09b"
 dependencies = [
  "itertools 0.14.0",
  "p3-field",
@@ -3078,8 +3197,8 @@ dependencies = [
 
 [[package]]
 name = "p3-util"
-version = "0.3.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=main#01e5b79a6f3593986c4bcdeb6bb6a9bb76424375"
+version = "0.4.1"
+source = "git+https://github.com/Plonky3/Plonky3?branch=main#584ab6fea44a2855ace0b7aa19b66f70a258a09b"
 dependencies = [
  "serde",
 ]
@@ -5688,6 +5807,7 @@ version = "0.1.0"
 dependencies = [
  "bytemuck",
  "criterion",
+ "p3-circle",
  "p3-dft",
  "p3-field",
  "p3-matrix",
@@ -5705,6 +5825,17 @@ dependencies = [
  "criterion",
  "metal",
  "objc",
+ "p3-challenger",
+ "p3-circle",
+ "p3-commit",
+ "p3-dft",
+ "p3-field",
+ "p3-fri",
+ "p3-matrix",
+ "p3-merkle-tree",
+ "p3-mersenne-31",
+ "p3-symmetric",
+ "p3-util",
  "rand 0.8.5",
  "rayon",
  "serde",
diff --git a/Cargo.toml b/Cargo.toml
index b90fc92..2dbee70 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -48,3 +48,10 @@ p3-field = { git = "https://github.com/Plonky3/Plonky3", branch = "main" }
 p3-mersenne-31 = { git = "https://github.com/Plonky3/Plonky3", branch = "main" }
 p3-dft = { git = "https://github.com/Plonky3/Plonky3", branch = "main" }
 p3-matrix = { git = "https://github.com/Plonky3/Plonky3", branch = "main" }
+p3-circle = { git = "https://github.com/Plonky3/Plonky3", branch = "main" }
+p3-fri = { git = "https://github.com/Plonky3/Plonky3", branch = "main" }
+p3-commit = { git = "https://github.com/Plonky3/Plonky3", branch = "main" }
+p3-challenger = { git = "https://github.com/Plonky3/Plonky3", branch = "main" }
+p3-symmetric = { git = "https://github.com/Plonky3/Plonky3", branch = "main" }
+p3-merkle-tree = { git = "https://github.com/Plonky3/Plonky3", branch = "main" }
+p3-util = { git = "https://github.com/Plonky3/Plonky3", branch = "main" }
diff --git a/crates/primitives/Cargo.toml b/crates/primitives/Cargo.toml
index 0cc8b00..25890ad 100644
--- a/crates/primitives/Cargo.toml
+++ b/crates/primitives/Cargo.toml
@@ -14,7 +14,8 @@ p3-field = { workspace = true }
 p3-mersenne-31 = { workspace = true }
 p3-dft = { workspace = true }
 p3-matrix = { workspace = true }
-
+p3-circle = {workspace = true}
+# Add these to workspace dependencies
 [dev-dependencies]
 rand = { workspace = true }
 criterion = "0.5"
diff --git a/crates/primitives/src/circle.rs b/crates/primitives/src/circle.rs
index 3286a46..4369492 100644
--- a/crates/primitives/src/circle.rs
+++ b/crates/primitives/src/circle.rs
@@ -1,10 +1,11 @@
-//! Circle group and Circle FFT for Mersenne31.
+//! Circle group and Circle FFT for Mersenne31 - Plonky3 Integration.
 //!
-//! # Circle STARKs Background
+//! This module provides Circle STARKs primitives using Plonky3's optimized
+//! O(n log n) Circle FFT implementation with SIMD acceleration.
 //!
-//! The Mersenne31 field M31 doesn't have large 2-adic subgroups for standard NTT
-//! (since p-1 = 2·3·7·11·31·151·331, only a factor of 2).
+//! # Circle STARKs Background
 //!
+//! The Mersenne31 field M31 doesn't have large 2-adic subgroups for standard NTT.
 //! Instead, Circle STARKs use the **circle group**:
 //! ```text
 //! C(M31) = { (x, y) ∈ M31² : x² + y² = 1 }
@@ -12,52 +13,50 @@
 //!
 //! This group has order |C| = p + 1 = 2^31, giving us a full 2-adic subgroup!
 //!
-//! # Group Operations
-//!
-//! The circle group is isomorphic to the group of complex numbers with |z| = 1,
-//! under the map (x, y) ↔ x + iy. Multiplication follows the angle-addition formulas:
-//! - Identity: (1, 0)
-//! - Inverse: (x, y)⁻¹ = (x, -y)  
-//! - Product: (x₁, y₁) · (x₂, y₂) = (x₁x₂ - y₁y₂, x₁y₂ + y₁x₂)
-//! - Squaring: (x, y)² = (x² - y², 2xy) = (2x² - 1, 2xy)
-//!
-//! # Polynomial Evaluation
+//! # Performance
 //!
-//! We evaluate standard polynomials f(x) at the x-coordinates of circle points.
-//! However, for proper Circle FFT, we need to handle the fact that points
-//! (x, y) and (x, -y) share the same x-coordinate.
+//! This implementation leverages Plonky3's:
+//! - O(n log n) Circle FFT via butterfly algorithm
+//! - SIMD acceleration (NEON on Apple Silicon, AVX2/512 on x86)
+//! - Parallel processing via Rayon
 //!
 //! # References
 //!
-//! - Circle STARKs paper (Polygon/StarkWare)
-//! - Stwo prover implementation
+//! - Circle STARKs paper: https://eprint.iacr.org/2024/278
+//! - Plonky3: https://github.com/Plonky3/Plonky3
 
 use crate::field::M31;
+use crate::p3_interop::{to_p3, from_p3, to_p3_vec, from_p3_vec, P3M31};
 use serde::{Deserialize, Serialize};
 
+// Import Plonky3 traits
+use p3_field::PrimeCharacteristicRing;
+use p3_field::extension::ComplexExtendable;
+use p3_matrix::Matrix;
+
+// Re-export Plonky3 types for advanced usage
+pub use p3_circle::{
+    CircleDomain as P3CircleDomain,
+    CircleEvaluations as P3CircleEvaluations,
+};
+
 // ============================================================================
 // Square Root in M31
 // ============================================================================
 
 /// Modular square root in M31.
-///
-/// Since M31 ≡ 3 (mod 4), we can use the simple formula:
-/// sqrt(a) = a^((p+1)/4) = a^(2^29)
-///
-/// Returns None if a is not a quadratic residue.
+/// Since M31 ≡ 3 (mod 4), we use: sqrt(a) = a^((p+1)/4) = a^(2^29)
 pub fn sqrt_m31(a: M31) -> Option<M31> {
     if a.is_zero() {
         return Some(M31::ZERO);
     }
-    
-    // For p ≡ 3 (mod 4): sqrt(a) = a^((p+1)/4) = a^(2^29)
+
     let r = a.pow_u64(1u64 << 29);
-    
-    // Verify: r² = a
+
     if r * r == a {
         Some(r)
     } else {
-        None // a is not a quadratic residue
+        None
     }
 }
 
@@ -155,7 +154,7 @@ impl CirclePoint {
     pub fn pow(self, mut n: u64) -> Self {
         let mut result = Self::IDENTITY;
         let mut base = self;
-        
+
         while n > 0 {
             if n & 1 == 1 {
                 result = result.mul(base);
@@ -163,7 +162,7 @@ impl CirclePoint {
             base = base.double();
             n >>= 1;
         }
-        
+
         result
     }
 
@@ -173,70 +172,60 @@ impl CirclePoint {
         self.x
     }
 
-    /// Get y-coordinate.  
+    /// Get y-coordinate.
     #[inline]
     pub fn y_coord(self) -> M31 {
         self.y
     }
 
-    // ========================================================================
-    // Generator Construction
-    // ========================================================================
-
     /// Generator of the circle subgroup of order 2^log_order.
     ///
     /// The full circle group C(M31) has order p + 1 = 2^31.
     /// This returns a generator for the unique subgroup of order 2^log_order.
     pub fn generator(log_order: usize) -> Self {
         assert!(log_order <= 31, "Maximum subgroup order is 2^31");
-        
-        // Start with the generator of order 2^31
-        let g = Self::generator_order_2_31();
-        
-        // Square (31 - log_order) times to get generator of order 2^log_order
-        // g^(2^(31-k)) has order 2^k
-        let mut result = g;
-        for _ in log_order..31 {
-            result = result.double();
-        }
-        
-        result
+
+        // Use Plonky3's circle_two_adic_generator
+        let g = P3M31::circle_two_adic_generator(log_order);
+        Self {
+            x: from_p3(g.real()),
+            y: from_p3(g.imag()),
+        }
+    }
+
+    /// Map from projective line to circle point.
+    /// (x, y) = ((1-t²)/(1+t²), 2t/(1+t²))
+    pub fn from_projective_line(t: M31) -> Option<Self> {
+        use p3_field::Field;
+        let t_p3 = to_p3(t);
+        let t2 = PrimeCharacteristicRing::square(&t_p3);
+        let denom = P3M31::ONE + t2;
+        denom.try_inverse().map(|inv_denom| {
+            Self {
+                x: from_p3((P3M31::ONE - t2) * inv_denom),
+                y: from_p3(PrimeCharacteristicRing::double(&t_p3) * inv_denom),
+            }
+        })
     }
 
-    /// Generator of the full 2^31 subgroup of C(M31).
-    ///
-    /// This is a primitive 2^31-th root of unity on the circle, meaning:
-    /// - g^(2^31) = (1, 0)  
-    /// - g^(2^30) ≠ (1, 0)
-    ///
-    /// We use the canonical generator from Circle STARKs:
-    /// g = (2, sqrt(1 - 4)) = (2, sqrt(-3))
-    fn generator_order_2_31() -> Self {
-        // The canonical Circle STARK generator has x = 2
-        // y² = 1 - x² = 1 - 4 = -3 (mod p)
-        // -3 mod p = p - 3 = 2147483644
-        //
-        // We need sqrt(p - 3) mod p.
-        // Precomputed: sqrt(2147483644) mod (2^31 - 1) = 1268011823
-        //
-        // Verification: 1268011823² mod (2^31 - 1) = 2147483644 ✓
-        // And: 2² + 1268011823² mod (2^31 - 1) = 4 + 2147483644 = 2147483648 = 1 ✓
-
-        let x = M31::new(2);
-        let y = M31::new(1268011823);
-        
-        debug_assert!(x * x + y * y == M31::ONE, "Generator not on circle");
-        
-        Self { x, y }
+    /// Map from circle point to projective line.
+    /// t = y / (x + 1)
+    /// Returns None if x = -1.
+    pub fn to_projective_line(self) -> Option<M31> {
+        use p3_field::Field;
+        let x_plus_1 = to_p3(self.x) + P3M31::ONE;
+        x_plus_1.try_inverse().map(|inv| from_p3(inv * to_p3(self.y)))
     }
 
-    /// Alternative generator constructor that computes y from x.
-    #[allow(dead_code)]
-    fn generator_order_2_31_computed() -> Self {
-        let x = M31::new(2);
-        let y_squared = M31::ONE - x * x;  // 1 - 4 = -3 = p - 3
-        let y = sqrt_m31(y_squared).expect("y² should be a QR");
-        Self { x, y }
+    /// Evaluate vanishing polynomial v_n at this point.
+    /// v_n(P) = P.x after (n-1) doublings
+    pub fn v_n(self, log_n: usize) -> M31 {
+        let mut x = self.x;
+        for _ in 0..(log_n - 1) {
+            // Squaring map on x: x -> 2x² - 1
+            x = x * x + x * x - M31::ONE;
+        }
+        x
     }
 }
 
@@ -247,13 +236,15 @@ impl Default for CirclePoint {
 }
 
 // ============================================================================
-// Circle Domain
+// Circle Domain - Uses Plonky3's CircleDomain internally
 // ============================================================================
 
 /// A domain for Circle polynomial evaluation.
 ///
 /// Represents the cyclic group generated by g where g has order 2^log_size.
 /// Points are [g^0, g^1, ..., g^(n-1)] where n = 2^log_size.
+///
+/// Internally uses Plonky3's CircleDomain for O(n log n) FFT operations.
 #[derive(Clone, Debug)]
 pub struct CircleDomain {
     /// Log₂ of the domain size.
@@ -262,6 +253,8 @@ pub struct CircleDomain {
     pub size: usize,
     /// Generator of this domain.
     pub generator: CirclePoint,
+    /// The underlying Plonky3 domain.
+    p3_domain: P3CircleDomain<P3M31>,
     /// Precomputed domain points.
     points: Vec<CirclePoint>,
 }
@@ -270,10 +263,11 @@ impl CircleDomain {
     /// Create a circle domain of size 2^log_size.
     pub fn new(log_size: usize) -> Self {
         assert!(log_size <= 31, "Domain size exceeds circle group order");
-        
+
         let size = 1usize << log_size;
         let generator = CirclePoint::generator(log_size);
-        
+        let p3_domain = P3CircleDomain::<P3M31>::standard(log_size);
+
         // Precompute all domain points: [g^0, g^1, ..., g^(n-1)]
         let mut points = Vec::with_capacity(size);
         let mut current = CirclePoint::IDENTITY;
@@ -281,11 +275,19 @@ impl CircleDomain {
             points.push(current);
             current = current.mul(generator);
         }
-        
-        // Verify: the last multiplication should give identity
-        debug_assert!(current.is_identity(), "Domain points don't form a cycle");
-        
-        Self { log_size, size, generator, points }
+
+        Self { log_size, size, generator, p3_domain, points }
+    }
+
+    /// Create a standard domain (alias for new).
+    pub fn standard(log_size: usize) -> Self {
+        Self::new(log_size)
+    }
+
+    /// Get the underlying Plonky3 domain.
+    #[inline]
+    pub fn p3_domain(&self) -> P3CircleDomain<P3M31> {
+        self.p3_domain
     }
 
     /// Get the i-th domain point (g^i).
@@ -313,14 +315,12 @@ impl CircleDomain {
     pub fn verify(&self) -> bool {
         self.points.iter().all(|p| p.is_valid())
     }
-    
+
     /// Get unique x-coordinates (for polynomial evaluation).
-    /// Returns (unique_xs, mapping) where mapping[i] gives the index in unique_xs
-    /// for domain point i.
     pub fn unique_x_coords(&self) -> (Vec<M31>, Vec<usize>) {
         let mut unique_xs = Vec::new();
         let mut mapping = Vec::with_capacity(self.size);
-        
+
         for p in &self.points {
             if let Some(idx) = unique_xs.iter().position(|&x| x == p.x) {
                 mapping.push(idx);
@@ -329,7 +329,7 @@ impl CircleDomain {
                 unique_xs.push(p.x);
             }
         }
-        
+
         (unique_xs, mapping)
     }
 }
@@ -358,7 +358,7 @@ impl Coset {
         let shifted_points = domain.points.iter()
             .map(|p| shift.mul(*p))
             .collect();
-        
+
         Self { domain, shift, shifted_points }
     }
 
@@ -369,11 +369,7 @@ impl Coset {
     /// disjoint from the original domain.
     pub fn lde_coset(log_size: usize) -> Self {
         let domain = CircleDomain::new(log_size);
-        
-        // Shift by generator of order 2n (one step up in the subgroup chain)
-        // This gives a coset h·D that is disjoint from D
         let shift = CirclePoint::generator(log_size + 1);
-        
         Self::new(domain, shift)
     }
 
@@ -395,27 +391,23 @@ impl Coset {
 }
 
 // ============================================================================
-// Circle FFT
+// Circle FFT - Plonky3 O(n log n) Implementation
 // ============================================================================
 
-/// Circle FFT for transforming between coefficient and evaluation representations.
+/// Circle FFT using Plonky3's optimized O(n log n) implementation.
 ///
-/// # Note on Circle Polynomial Representation
+/// This provides massive speedups over the naive O(n²) implementation:
+/// - Uses butterfly algorithm with O(n log n) complexity
+/// - SIMD acceleration (NEON/AVX2/AVX512)
+/// - Parallel processing via Rayon
 ///
-/// Standard univariate polynomials f(x) cannot be directly evaluated on circle domains
-/// because points (x, y) and (x, -y) share the same x-coordinate. Instead, we use:
-///
-/// 1. **For FFT**: Evaluate f(x) at the **unique** x-coordinates in the first half of
-///    the domain. The domain is structured so the first half has all unique x-values.
-///
-/// 2. **For IFFT**: Interpolate using only the unique x-coordinates.
-///
-/// This gives us a consistent polynomial representation for Circle STARKs.
-///
-/// # Complexity
-///
-/// - FFT: O(n²) field operations (can be O(n log n) with proper Circle FFT)
-/// - IFFT: O(n²) field operations (Lagrange interpolation)
+/// # Example
+/// ```ignore
+/// let fft = CircleFFT::new(10); // Domain size 2^10 = 1024
+/// let coeffs = vec![M31::new(1), M31::new(2), M31::new(3)];
+/// let evals = fft.fft(&coeffs);
+/// let recovered = fft.ifft(&evals);
+/// ```
 #[derive(Clone, Debug)]
 pub struct CircleFFT {
     /// The evaluation domain.
@@ -431,66 +423,81 @@ impl CircleFFT {
 
     /// Forward FFT: polynomial coefficients → evaluations.
     ///
-    /// Input: coefficients [c₀, c₁, ..., c_{n/2-1}] (degree < n/2)
-    /// Output: evaluations [f(p₀), f(p₁), ..., f(p_{n-1})]
+    /// **O(n log n) complexity** using Plonky3's optimized butterfly algorithm.
     ///
-    /// The polynomial is evaluated at all domain points using their x-coordinates.
-    /// For twin points (x, y) and (x, -y), they get the same evaluation f(x).
+    /// Input: coefficients [c₀, c₁, ..., c_{n-1}]
+    /// Output: evaluations [f(p₀), f(p₁), ..., f(p_{n-1})]
     pub fn fft(&self, coeffs: &[M31]) -> Vec<M31> {
+        use p3_matrix::dense::RowMajorMatrix;
+
         let n = self.domain.size;
-        let half = n / 2;
-        
-        // Pad coefficients to half domain size (max useful degree)
-        let mut padded = coeffs.to_vec();
-        if padded.len() > half {
-            padded.truncate(half);
-        }
-        padded.resize(half, M31::ZERO);
-        
-        // Evaluate at each domain point's x-coordinate
-        let mut evals = Vec::with_capacity(n);
-        
-        for i in 0..n {
-            let x = self.domain.get_point(i).x;
-            let val = evaluate_poly(&padded, x);
-            evals.push(val);
-        }
-        
-        evals
+
+        // Pad coefficients to domain size
+        let mut p3_coeffs: Vec<P3M31> = to_p3_vec(coeffs);
+        p3_coeffs.resize(n, P3M31::ZERO);
+
+        // Use Plonky3's O(n log n) Circle FFT
+        let coeffs_matrix = RowMajorMatrix::new_col(p3_coeffs);
+        let evals = P3CircleEvaluations::evaluate(self.domain.p3_domain, coeffs_matrix);
+
+        // Convert back to ZP1 M31
+        from_p3_vec(&evals.to_natural_order().to_row_major_matrix().values)
     }
 
     /// Inverse FFT: evaluations → polynomial coefficients.
     ///
-    /// Input: evaluations [f(p₀), f(p₁), ..., f(p_{n-1})]  
-    /// Output: coefficients [c₀, c₁, ..., c_{n/2-1}]
+    /// **O(n log n) complexity** using Plonky3's optimized butterfly algorithm.
     ///
-    /// Uses only the first half of evaluations (which correspond to unique x-coordinates
-    /// in a properly structured domain).
+    /// Input: evaluations [f(p₀), f(p₁), ..., f(p_{n-1})]
+    /// Output: coefficients [c₀, c₁, ..., c_{n-1}]
     pub fn ifft(&self, evals: &[M31]) -> Vec<M31> {
+        use p3_matrix::dense::RowMajorMatrix;
+
         let n = self.domain.size;
-        let half = n / 2;
-        
         assert_eq!(evals.len(), n, "Evaluation count must match domain size");
-        
-        // Get x-coordinates of first half (should be unique)
-        let xs: Vec<M31> = (0..half).map(|i| self.domain.get_point(i).x).collect();
-        let ys: Vec<M31> = (0..half).map(|i| evals[i]).collect();
-        
-        // Lagrange interpolation on the unique x-coordinates
-        interpolate_lagrange(&xs, &ys)
+
+        // Convert to Plonky3 format
+        let p3_evals: Vec<P3M31> = to_p3_vec(evals);
+        let evals_matrix = RowMajorMatrix::new_col(p3_evals);
+
+        // Use Plonky3's O(n log n) Circle IFFT
+        let circle_evals = P3CircleEvaluations::from_natural_order(
+            self.domain.p3_domain,
+            evals_matrix,
+        );
+        let coeffs = circle_evals.interpolate();
+
+        // Convert back to ZP1 M31
+        from_p3_vec(&coeffs.values)
     }
 
     /// Low-degree extension: extend evaluations to a larger domain.
     ///
+    /// **O(n log n) complexity** using Plonky3's extrapolate.
+    ///
     /// Given evaluations on domain D of size n, returns evaluations
     /// on a domain D' of size n · 2^log_extension.
     pub fn extend(&self, evals: &[M31], log_extension: usize) -> Vec<M31> {
-        // Recover coefficients
-        let coeffs = self.ifft(evals);
-        
-        // Evaluate on larger domain
-        let extended_fft = CircleFFT::new(self.domain.log_size + log_extension);
-        extended_fft.fft(&coeffs)
+        use p3_matrix::dense::RowMajorMatrix;
+
+        let n = self.domain.size;
+        assert_eq!(evals.len(), n, "Evaluation count must match domain size");
+
+        // Convert to Plonky3 format
+        let p3_evals: Vec<P3M31> = to_p3_vec(evals);
+        let evals_matrix = RowMajorMatrix::new_col(p3_evals);
+
+        // Create circle evaluations and extrapolate
+        let circle_evals = P3CircleEvaluations::from_natural_order(
+            self.domain.p3_domain,
+            evals_matrix,
+        );
+
+        let target_domain = P3CircleDomain::standard(self.domain.log_size + log_extension);
+        let extended = circle_evals.extrapolate(target_domain);
+
+        // Convert back to ZP1 M31
+        from_p3_vec(&extended.to_natural_order().to_row_major_matrix().values)
     }
 
     /// Get domain size.
@@ -514,244 +521,15 @@ impl CircleFFT {
     }
 }
 
+/// Fast Circle FFT - alias for CircleFFT (both use Plonky3's O(n log n) algorithm).
+pub type FastCircleFFT = CircleFFT;
+
 // ============================================================================
-// Fast Circle FFT (O(n log n) Butterfly Algorithm)
+// Polynomial Utilities
 // ============================================================================
-// Based on Stwo's proven implementation (Apache 2.0 licensed)
-
-/// Butterfly operation for forward FFT.
-/// 
-/// Given v0, v1 and twiddle factor t, computes:
-/// - v0_new = v0 + v1 * t
-/// - v1_new = v0 - v1 * t
-#[inline]
-pub fn butterfly(v0: &mut M31, v1: &mut M31, twid: M31) {
-    let tmp = *v1 * twid;
-    *v1 = *v0 - tmp;
-    *v0 = *v0 + tmp;
-}
-
-/// Inverse butterfly operation for inverse FFT.
-/// 
-/// Given v0, v1 and inverse twiddle factor it, computes:
-/// - v0_new = v0 + v1
-/// - v1_new = (v0 - v1) * it
-#[inline]  
-pub fn ibutterfly(v0: &mut M31, v1: &mut M31, itwid: M31) {
-    let tmp = *v0;
-    *v0 = tmp + *v1;
-    *v1 = (tmp - *v1) * itwid;
-}
-
-/// Precomputed twiddle factors for efficient FFT.
-/// 
-/// Twiddles are the x-coordinates of domain points, bit-reversed for
-/// efficient access during the butterfly passes.
-#[derive(Clone, Debug)]
-pub struct CircleTwiddles {
-    /// Forward twiddles (x-coordinates of coset points).
-    pub twiddles: Vec<M31>,
-    /// Inverse twiddles (multiplicative inverses).
-    pub itwiddles: Vec<M31>,
-    /// Log size of the domain.
-    pub log_size: usize,
-}
-
-impl CircleTwiddles {
-    /// Precompute twiddle factors for a domain of size 2^log_size.
-    /// 
-    /// Follows Stwo's algorithm: for each layer, store x-coordinates
-    /// of coset points in bit-reversed order.
-    pub fn new(log_size: usize) -> Self {
-        if log_size == 0 {
-            return Self {
-                twiddles: vec![M31::ONE],
-                itwiddles: vec![M31::ONE],
-                log_size,
-            };
-        }
-        
-        if log_size == 1 {
-            // For size 2, we just need the y-coordinate of the generator
-            let gen = CirclePoint::generator(1);
-            return Self {
-                twiddles: vec![gen.y, M31::ONE],
-                itwiddles: vec![gen.y.inv(), M31::ONE],
-                log_size,
-            };
-        }
-        
-        // Start with a coset that generates the domain
-        // Use generator of order 2^log_size
-        let mut coset = CirclePoint::generator(log_size);
-        let mut coset_size = 1usize << log_size;
-        
-        let mut twiddles = Vec::with_capacity(coset_size);
-        
-        // For each layer, compute and store twiddles
-        // The twiddles are the x-coordinates of coset points
-        for layer in 0..log_size {
-            let start_idx = twiddles.len();
-            let half_size = coset_size / 2;
-            
-            // For each layer, collect x-coordinates of the first half of coset points
-            // Start from identity and step by generator
-            let mut point = CirclePoint::IDENTITY;
-            for _ in 0..half_size {
-                twiddles.push(point.x);
-                point = point.mul(coset);
-            }
-            
-            // Bit-reverse this layer's twiddles
-            if half_size > 1 {
-                bit_reverse_permutation(&mut twiddles[start_idx..]);
-            }
-            
-            // Double the coset generator for next layer
-            coset = coset.double();
-            coset_size /= 2;
-            
-            // After first layer, x-coordinates should all be non-zero
-            // The identity point has x=1, and we step by a generator that
-            // produces points with different x-coords
-            if layer == 0 && half_size > 0 {
-                // First layer contains identity (x=1), which is fine
-            }
-        }
-        
-        // Pad to power of 2 for alignment
-        twiddles.push(M31::ONE);
-        
-        // Compute inverse twiddles with safe fallback for any zeros
-        let itwiddles: Vec<M31> = twiddles.iter().map(|t| {
-            if t.is_zero() {
-                M31::ONE // Fallback for zero (should not happen in well-formed domains)
-            } else {
-                t.inv()
-            }
-        }).collect();
-        
-        Self { twiddles, itwiddles, log_size }
-    }
-    
-    /// Get twiddles for a specific layer.
-    fn layer_twiddles(&self, layer: usize) -> &[M31] {
-        if layer >= self.log_size {
-            return &[];
-        }
-        
-        // Calculate start index for this layer
-        let mut start = 0;
-        let mut layer_size = 1 << (self.log_size - 1);
-        for _ in 0..layer {
-            start += layer_size;
-            layer_size /= 2;
-        }
-        
-        &self.twiddles[start..(start + layer_size.max(1))]
-    }
-}
-
-/// Fast Circle FFT implementation.
-/// 
-/// NOTE: Currently delegates to the O(n²) CircleFFT for correctness.
-/// The butterfly operations above are ready for O(n log n) implementation.
-/// 
-/// TODO: Implement proper O(n log n) butterfly-based Circle FFT.
-/// See: https://github.com/starkware-libs/stwo
-#[derive(Clone, Debug)]
-pub struct FastCircleFFT {
-    /// Delegate to proven implementation.
-    inner: CircleFFT,
-    #[allow(dead_code)]
-    twiddles: CircleTwiddles,
-}
-
-impl FastCircleFFT {
-    /// Create a Fast Circle FFT for domain size 2^log_size.
-    pub fn new(log_size: usize) -> Self {
-        Self { 
-            inner: CircleFFT::new(log_size),
-            twiddles: CircleTwiddles::new(log_size),
-        }
-    }
-    
-    /// Forward FFT: polynomial coefficients → evaluations.
-    pub fn fft(&self, coeffs: &[M31]) -> Vec<M31> {
-        self.inner.fft(coeffs)
-    }
-    
-    /// Inverse FFT: evaluations → polynomial coefficients.
-    pub fn ifft(&self, evals: &[M31]) -> Vec<M31> {
-        self.inner.ifft(evals)
-    }
-    
-    /// Low-degree extension using FFT.
-    pub fn extend(&self, evals: &[M31], log_extension: usize) -> Vec<M31> {
-        self.inner.extend(evals, log_extension)
-    }
-    
-    /// Get domain size.
-    pub fn size(&self) -> usize {
-        self.inner.size()
-    }
-    
-    /// Get log domain size.
-    pub fn log_size(&self) -> usize {
-        self.inner.log_size()
-    }
-    
-    /// Get the domain.
-    pub fn domain(&self) -> &CircleDomain {
-        self.inner.domain()
-    }
-}
-
-/// Execute one layer of the FFT butterfly algorithm.
-/// 
-/// This processes all butterflies at a given layer with the same twiddle factor.
-#[inline]
-fn fft_layer_loop<F>(
-    values: &mut [M31], 
-    layer: usize, 
-    h: usize, 
-    twid: M31, 
-    butterfly_fn: F
-) where F: Fn(&mut M31, &mut M31, M31) {
-    let layer_size = 1 << layer;
-    for l in 0..layer_size {
-        let idx0 = (h << (layer + 1)) + l;
-        let idx1 = idx0 + layer_size;
-        if idx1 < values.len() {
-            let (mut val0, mut val1) = (values[idx0], values[idx1]);
-            butterfly_fn(&mut val0, &mut val1, twid);
-            values[idx0] = val0;
-            values[idx1] = val1;
-        }
-    }
-}
-
-/// Compute circle twiddles (layer 0) from line twiddles (layer 1).
-/// 
-/// The relationship between consecutive domain points allows us to derive
-/// the y-coordinate twiddles from the x-coordinate twiddles.
-fn circle_twiddles_from_line(line_twiddles: &[M31]) -> impl Iterator<Item = M31> + '_ {
-    // Each pair of x-coordinates [x, y] generates circle twiddles [y, -y, -x, x]
-    line_twiddles.chunks(2).flat_map(|chunk| {
-        if chunk.len() == 2 {
-            vec![chunk[1], -chunk[1], -chunk[0], chunk[0]]
-        } else if chunk.len() == 1 {
-            vec![chunk[0]]
-        } else {
-            vec![]
-        }
-    })
-}
 
 /// Evaluate polynomial at a single point using Horner's method.
-///
 /// For f(x) = c₀ + c₁x + c₂x² + ... + cₙxⁿ, computes f(point).
-/// Complexity: O(n) field operations.
 #[inline]
 pub fn evaluate_poly(coeffs: &[M31], point: M31) -> M31 {
     let mut result = M31::ZERO;
@@ -761,87 +539,20 @@ pub fn evaluate_poly(coeffs: &[M31], point: M31) -> M31 {
     result
 }
 
-/// Lagrange interpolation: given (xᵢ, yᵢ) pairs, find polynomial f with f(xᵢ) = yᵢ.
-///
-/// Complexity: O(n²) field operations.
-/// Panics if xs contains duplicates.
-pub fn interpolate_lagrange(xs: &[M31], ys: &[M31]) -> Vec<M31> {
-    let n = xs.len();
-    assert_eq!(n, ys.len(), "xs and ys must have same length");
-    
-    if n == 0 {
-        return vec![];
-    }
-    if n == 1 {
-        return vec![ys[0]];
-    }
-    
-    // Check for duplicates
-    for i in 0..n {
-        for j in (i+1)..n {
-            assert!(xs[i] != xs[j], "Duplicate x values in interpolation: x[{}] = x[{}] = {}", 
-                    i, j, xs[i].value());
-        }
-    }
-    
-    let mut coeffs = vec![M31::ZERO; n];
-    
-    for i in 0..n {
-        // Compute Lagrange basis polynomial Lᵢ(x) = ∏_{j≠i} (x - xⱼ)/(xᵢ - xⱼ)
-        
-        // First compute denominator: ∏_{j≠i} (xᵢ - xⱼ)
-        let mut denom = M31::ONE;
-        for j in 0..n {
-            if i != j {
-                denom = denom * (xs[i] - xs[j]);
-            }
-        }
-        
-        // Scale factor: yᵢ / denom
-        let scale = ys[i] * denom.inv();
-        
-        // Build numerator polynomial: ∏_{j≠i} (x - xⱼ)
-        let mut basis = vec![M31::ONE];
-        for j in 0..n {
-            if i != j {
-                // Multiply by (x - xⱼ)
-                let mut new_basis = vec![M31::ZERO; basis.len() + 1];
-                for (k, &b) in basis.iter().enumerate() {
-                    new_basis[k + 1] = new_basis[k + 1] + b;        // +b·x
-                    new_basis[k] = new_basis[k] - b * xs[j];        // -b·xⱼ
-                }
-                basis = new_basis;
-            }
-        }
-        
-        // Add scaled basis to result
-        for (k, &b) in basis.iter().enumerate() {
-            if k < n {
-                coeffs[k] = coeffs[k] + scale * b;
-            }
-        }
-    }
-    
-    coeffs
-}
-
 /// Multiply two polynomials.
-///
-/// Given f = [f₀, f₁, ...] and g = [g₀, g₁, ...], computes f·g.
-/// Complexity: O(n·m) where n, m are the degrees.
 pub fn poly_mul(f: &[M31], g: &[M31]) -> Vec<M31> {
     if f.is_empty() || g.is_empty() {
         return vec![];
     }
-    
+
     let mut result = vec![M31::ZERO; f.len() + g.len() - 1];
-    
+
     for (i, &fi) in f.iter().enumerate() {
         for (j, &gj) in g.iter().enumerate() {
             result[i + j] = result[i + j] + fi * gj;
         }
     }
-    
+
     result
 }
 
@@ -849,14 +560,14 @@ pub fn poly_mul(f: &[M31], g: &[M31]) -> Vec<M31> {
 pub fn poly_add(f: &[M31], g: &[M31]) -> Vec<M31> {
     let max_len = f.len().max(g.len());
     let mut result = vec![M31::ZERO; max_len];
-    
+
     for (i, &fi) in f.iter().enumerate() {
         result[i] = result[i] + fi;
     }
     for (i, &gi) in g.iter().enumerate() {
         result[i] = result[i] + gi;
     }
-    
+
     result
 }
 
@@ -864,14 +575,14 @@ pub fn poly_add(f: &[M31], g: &[M31]) -> Vec<M31> {
 pub fn poly_sub(f: &[M31], g: &[M31]) -> Vec<M31> {
     let max_len = f.len().max(g.len());
     let mut result = vec![M31::ZERO; max_len];
-    
+
     for (i, &fi) in f.iter().enumerate() {
         result[i] = result[i] + fi;
     }
     for (i, &gi) in g.iter().enumerate() {
         result[i] = result[i] - gi;
     }
-    
+
     result
 }
 
@@ -880,8 +591,7 @@ pub fn poly_scale(f: &[M31], c: M31) -> Vec<M31> {
     f.iter().map(|&fi| fi * c).collect()
 }
 
-/// Compute the degree of a polynomial (highest non-zero coefficient index).
-/// Returns None for the zero polynomial.
+/// Compute the degree of a polynomial.
 pub fn poly_degree(f: &[M31]) -> Option<usize> {
     for (i, &c) in f.iter().enumerate().rev() {
         if !c.is_zero() {
@@ -891,59 +601,19 @@ pub fn poly_degree(f: &[M31]) -> Option<usize> {
     None
 }
 
-/// Polynomial division with remainder.
-/// Returns (quotient, remainder) such that f = q·g + r with deg(r) < deg(g).
-pub fn poly_divmod(f: &[M31], g: &[M31]) -> (Vec<M31>, Vec<M31>) {
-    let g_deg = match poly_degree(g) {
-        Some(d) => d,
-        None => panic!("Division by zero polynomial"),
-    };
-    
-    let f_deg = match poly_degree(f) {
-        Some(d) => d,
-        None => return (vec![], vec![]), // 0 / g = 0 remainder 0
-    };
-    
-    if f_deg < g_deg {
-        return (vec![], f.to_vec());
-    }
-    
-    let mut remainder = f.to_vec();
-    let mut quotient = vec![M31::ZERO; f_deg - g_deg + 1];
-    
-    let lead_g_inv = g[g_deg].inv();
-    
-    for i in (0..=f_deg - g_deg).rev() {
-        let coeff = remainder[i + g_deg] * lead_g_inv;
-        quotient[i] = coeff;
-        
-        for j in 0..=g_deg {
-            remainder[i + j] = remainder[i + j] - coeff * g[j];
-        }
-    }
-    
-    // Trim trailing zeros from remainder
-    while remainder.len() > 1 && remainder.last() == Some(&M31::ZERO) {
-        remainder.pop();
-    }
-    
-    (quotient, remainder)
-}
-
 // ============================================================================
-// Bit Reversal (for FFT)
+// Bit Reversal
 // ============================================================================
 
 /// Bit-reverse permutation of a slice.
-#[allow(dead_code)]
 pub fn bit_reverse_permutation<T: Copy>(data: &mut [T]) {
     let n = data.len();
     if n <= 1 {
         return;
     }
-    
+
     let log_n = n.trailing_zeros() as usize;
-    
+
     for i in 0..n {
         let j = bit_reverse(i, log_n);
         if i < j {
@@ -959,372 +629,259 @@ pub fn bit_reverse(x: usize, log_n: usize) -> usize {
 }
 
 // ============================================================================
-// Tests
+// Circle FRI Twiddles and Folding
 // ============================================================================
 
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_sqrt() {
-        // sqrt(4) = 2
-        let four = M31::new(4);
-        let r = sqrt_m31(four).unwrap();
-        assert!(r * r == four);
-        
-        // sqrt(1) = 1
-        assert!(sqrt_m31(M31::ONE).unwrap() * sqrt_m31(M31::ONE).unwrap() == M31::ONE);
-        
-        // sqrt(0) = 0
-        assert_eq!(sqrt_m31(M31::ZERO), Some(M31::ZERO));
+/// Compute y-twiddles for the first FRI folding layer.
+///
+/// For Circle FRI, the first fold uses y-coordinates of domain points.
+/// Returns the inverse y-twiddles: 1/y for each point in the first coset (coset0).
+///
+/// This is used in the formula:
+/// `f_folded[i] = (f[i] + f[i']) / 2 + beta * (f[i] - f[i']) / (2 * y[i])`
+/// where i' is the twin index.
+///
+/// Following Plonky3's convention, we use the standard position twin-coset:
+/// - shift = generator(log_n + 1) to avoid zeros in y-coordinates
+/// - subgroup_generator = generator(log_n - 1) for stepping within coset0
+pub fn compute_y_twiddle_inverses(log_domain_size: usize) -> Vec<M31> {
+    if log_domain_size == 0 {
+        return vec![];
     }
 
-    #[test]
-    fn test_circle_point_identity() {
-        let id = CirclePoint::IDENTITY;
-        assert!(id.is_valid());
-        assert!(id.is_identity());
-        assert_eq!(id.x, M31::ONE);
-        assert_eq!(id.y, M31::ZERO);
-    }
+    let half_n = 1usize << (log_domain_size - 1);
 
-    #[test]
-    fn test_circle_point_mul() {
-        let p = CirclePoint::generator(4);
-        
-        // p * identity = p
-        assert_eq!(p.mul(CirclePoint::IDENTITY), p);
-        
-        // identity * p = p  
-        assert_eq!(CirclePoint::IDENTITY.mul(p), p);
-        
-        // p * p^(-1) = identity
-        let p_inv = p.inv();
-        assert!(p.mul(p_inv).is_identity());
-    }
+    // Standard position: shift = generator(log_n + 1), subgroup = generator(log_n - 1)
+    // This ensures coset0 has no y=0 points (those are only at the subgroup boundary)
+    let shift = CirclePoint::generator(log_domain_size + 1);
+    let subgroup_gen = if log_domain_size >= 2 {
+        CirclePoint::generator(log_domain_size - 1)
+    } else {
+        CirclePoint::IDENTITY
+    };
 
-    #[test]
-    fn test_circle_point_double() {
-        let g = CirclePoint::generator(4);
-        assert!(g.is_valid());
-        
-        let g2 = g.double();
-        assert!(g2.is_valid());
-        
-        // g.double() should equal g.mul(g)
-        assert_eq!(g2, g.mul(g));
+    // Collect y-coordinates of coset0: shift, shift*g, shift*g^2, ...
+    // Note: The circle group operation is mul (complex multiplication)
+    let mut ys = Vec::with_capacity(half_n);
+    let mut current = shift;
+    for _ in 0..half_n {
+        ys.push(current.y);
+        current = current.mul(subgroup_gen);
     }
 
-    #[test]
-    fn test_generator_is_valid() {
-        // Check generator is on circle
-        let g = CirclePoint::generator_order_2_31();
-        assert!(g.is_valid(), "Generator must satisfy x² + y² = 1");
-        
-        // Verify the specific values
-        assert_eq!(g.x, M31::new(2));
-        let y_sq = g.y * g.y;
-        let expected_y_sq = M31::ONE - g.x * g.x; // 1 - 4 = -3
-        assert_eq!(y_sq, expected_y_sq);
-    }
+    // Bit-reverse for CFFT ordering
+    bit_reverse_permutation(&mut ys);
 
-    #[test]
-    fn test_circle_point_order() {
-        // Generator of order 2^4 = 16
-        let g = CirclePoint::generator(4);
-        
-        // g^16 should be identity
-        let g16 = g.pow(16);
-        assert!(g16.is_identity(), "g^16 should be identity");
-        
-        // g^8 should NOT be identity
-        let g8 = g.pow(8);
-        assert!(!g8.is_identity(), "g^8 should not be identity");
-        
-        // g^4 should NOT be identity
-        let g4 = g.pow(4);
-        assert!(!g4.is_identity(), "g^4 should not be identity");
-    }
+    // Batch invert for efficiency (now handles zeros gracefully)
+    batch_inverse(&ys)
+}
 
-    #[test]
-    fn test_circle_domain() {
-        let domain = CircleDomain::new(3);
-        assert_eq!(domain.size, 8);
-        assert_eq!(domain.log_size, 3);
-        
-        // First point is identity
-        assert!(domain.get_point(0).is_identity());
-        
-        // All points are valid
-        assert!(domain.verify());
-        
-        // All points are distinct
-        for i in 0..domain.size {
-            for j in (i + 1)..domain.size {
-                assert_ne!(domain.get_point(i), domain.get_point(j));
-            }
-        }
+/// Compute x-twiddles for subsequent FRI folding layers.
+///
+/// After the first fold (which uses y), subsequent folds use x-coordinates.
+/// The layer parameter indicates which folding layer (0 = first x-fold, after y-fold).
+///
+/// Returns the inverse x-twiddles for the given layer.
+pub fn compute_x_twiddle_inverses(log_domain_size: usize, layer: usize) -> Vec<M31> {
+    // After y-fold, domain is halved. After each x-fold, halved again.
+    // At layer i of x-folding, we have 2^(log_domain_size - 1 - layer) points
+    let log_layer_size = log_domain_size.saturating_sub(1 + layer);
+    if log_layer_size < 2 {
+        return vec![];
     }
 
-    #[test]
-    fn test_bit_reverse() {
-        assert_eq!(bit_reverse(0b000, 3), 0b000);
-        assert_eq!(bit_reverse(0b001, 3), 0b100);
-        assert_eq!(bit_reverse(0b010, 3), 0b010);
-        assert_eq!(bit_reverse(0b011, 3), 0b110);
-        assert_eq!(bit_reverse(0b100, 3), 0b001);
-        assert_eq!(bit_reverse(0b101, 3), 0b101);
-        assert_eq!(bit_reverse(0b110, 3), 0b011);
-        assert_eq!(bit_reverse(0b111, 3), 0b111);
-    }
+    let num_twiddles = 1usize << (log_layer_size - 1);
+    let generator = CirclePoint::generator(log_layer_size + 1);
+    let shift = CirclePoint::generator(log_layer_size + 2); // Standard position shift
 
-    #[test]
-    fn test_evaluate_poly() {
-        // f(x) = 1 + 2x + 3x²
-        let coeffs = vec![M31::new(1), M31::new(2), M31::new(3)];
-        
-        // f(0) = 1
-        assert_eq!(evaluate_poly(&coeffs, M31::ZERO), M31::new(1));
-        
-        // f(1) = 1 + 2 + 3 = 6
-        assert_eq!(evaluate_poly(&coeffs, M31::ONE), M31::new(6));
-        
-        // f(2) = 1 + 4 + 12 = 17
-        assert_eq!(evaluate_poly(&coeffs, M31::new(2)), M31::new(17));
-        
-        // f(10) = 1 + 20 + 300 = 321
-        assert_eq!(evaluate_poly(&coeffs, M31::new(10)), M31::new(321));
+    // Collect x-coordinates
+    let mut xs = Vec::with_capacity(num_twiddles);
+    let mut current = shift;
+    for _ in 0..num_twiddles {
+        xs.push(current.x);
+        current = current.mul(generator);
     }
 
-    #[test]
-    fn test_interpolate() {
-        // Points: (0,1), (1,6), (2,17), (3,34)
-        // These lie on f(x) = 1 + 2x + 3x²
-        let xs = vec![M31::new(0), M31::new(1), M31::new(2), M31::new(3)];
-        let ys = vec![M31::new(1), M31::new(6), M31::new(17), M31::new(34)];
-        
-        let coeffs = interpolate_lagrange(&xs, &ys);
-        
-        // Verify interpolation
-        for i in 0..4 {
-            let y = evaluate_poly(&coeffs, xs[i]);
-            assert_eq!(y, ys[i], "Interpolation failed at x={}", i);
-        }
-    }
+    // Bit-reverse for CFFT ordering
+    bit_reverse_permutation(&mut xs);
 
-    #[test]
-    fn test_fft_constant() {
-        // FFT of constant polynomial f(x) = 42
-        let fft = CircleFFT::new(3);
-        let coeffs = vec![M31::new(42)];
-        
-        let evals = fft.fft(&coeffs);
-        
-        // All evaluations should be 42
-        for (i, &e) in evals.iter().enumerate() {
-            assert_eq!(e, M31::new(42), "Eval at {} should be 42", i);
-        }
-    }
+    // Batch invert
+    batch_inverse(&xs)
+}
 
-    #[test]
-    fn test_fft_linear() {
-        // FFT of f(x) = 1 + 2x
-        let fft = CircleFFT::new(2);
-        let coeffs = vec![M31::new(1), M31::new(2)];
-        
-        let evals = fft.fft(&coeffs);
-        
-        // Verify each evaluation manually
-        for i in 0..4 {
-            let x = fft.get_domain_point(i).x;
-            let expected = M31::new(1) + M31::new(2) * x;
-            assert_eq!(evals[i], expected, "FFT mismatch at point {}", i);
-        }
+/// Batch multiplicative inverse using Montgomery's trick.
+///
+/// Computes 1/x for each x in O(n) field operations instead of O(n) inversions.
+/// Zeros in the input produce zeros in the output (0^(-1) = 0 by convention).
+pub fn batch_inverse(values: &[M31]) -> Vec<M31> {
+    if values.is_empty() {
+        return vec![];
     }
 
-    #[test]
-    fn test_fft_ifft_roundtrip() {
-        let fft = CircleFFT::new(3);
-        let half = fft.size() / 2;
-        
-        // Original polynomial of degree < n/2
-        let original = vec![
-            M31::new(1), M31::new(2), M31::new(3), M31::new(4),
-        ];
-        assert!(original.len() <= half);
-        
-        let evals = fft.fft(&original);
-        let recovered = fft.ifft(&evals);
-        
-        // Should recover original coefficients
-        for i in 0..original.len() {
-            assert_eq!(recovered[i], original[i], 
-                "Roundtrip failed at {}: got {}, expected {}", 
-                i, recovered[i].value(), original[i].value());
+    let n = values.len();
+
+    // Forward pass: compute prefix products, skipping zeros
+    let mut prefix_products = Vec::with_capacity(n);
+    let mut running = M31::ONE;
+    for &v in values {
+        prefix_products.push(running);
+        if !v.is_zero() {
+            running = running * v;
         }
     }
 
-    #[test]
-    fn test_poly_mul() {
-        // (1 + x) * (1 + 2x) = 1 + 3x + 2x²
-        let f = vec![M31::new(1), M31::new(1)];
-        let g = vec![M31::new(1), M31::new(2)];
-        let h = poly_mul(&f, &g);
-        
-        assert_eq!(h.len(), 3);
-        assert_eq!(h[0], M31::new(1));
-        assert_eq!(h[1], M31::new(3));
-        assert_eq!(h[2], M31::new(2));
+    // Single inversion of the product (if running is zero, all values were zero)
+    if running.is_zero() {
+        return vec![M31::ZERO; n];
     }
+    let mut running_inv = running.inv();
 
-    #[test]
-    fn test_poly_divmod() {
-        // (2x² + 3x + 1) / (x + 1) = (2x + 1) remainder 0
-        let f = vec![M31::new(1), M31::new(3), M31::new(2)];
-        let g = vec![M31::new(1), M31::new(1)];
-        
-        let (q, r) = poly_divmod(&f, &g);
-        
-        // Verify: f = q*g + r
-        let qg = poly_mul(&q, &g);
-        let reconstructed = poly_add(&qg, &r);
-        
-        for i in 0..f.len() {
-            assert_eq!(reconstructed[i], f[i], "Division check failed at {}", i);
+    // Backward pass: compute inverses
+    let mut result = vec![M31::ZERO; n];
+    for i in (0..n).rev() {
+        if values[i].is_zero() {
+            // 0^(-1) = 0 by convention for this use case
+            result[i] = M31::ZERO;
+        } else {
+            result[i] = prefix_products[i] * running_inv;
+            running_inv = running_inv * values[i];
         }
     }
 
-    #[test]
-    fn test_lde_extension() {
-        let fft = CircleFFT::new(2);
-        
-        // Polynomial: f(x) = 1 + 2x (degree 1, fits in domain of size 4)
-        let coeffs = vec![M31::new(1), M31::new(2)];
-        let evals = fft.fft(&coeffs);
-        
-        // Extend to domain of size 8
-        let extended = fft.extend(&evals, 1);
-        
-        assert_eq!(extended.len(), 8);
-        
-        // Verify extended evaluations are correct
-        let extended_fft = CircleFFT::new(3);
-        for i in 0..8 {
-            let x = extended_fft.get_domain_point(i).x;
-            let expected = evaluate_poly(&coeffs, x);
-            assert_eq!(extended[i], expected, "LDE mismatch at {}", i);
-        }
-    }
-    
-    #[test]
-    fn test_domain_x_coords_first_half_unique() {
-        // For a domain of size n, verify the first n/2 x-coordinates are unique
-        let domain = CircleDomain::new(4);  // size 16
-        let half = domain.size / 2;
-        
-        let xs: Vec<M31> = (0..half).map(|i| domain.get_point(i).x).collect();
-        
-        // Check uniqueness
-        for i in 0..half {
-            for j in (i+1)..half {
-                assert_ne!(xs[i], xs[j], "Duplicate x at positions {} and {}", i, j);
-            }
-        }
-    }
-    
-    // ========================================================================
-    // FastCircleFFT Tests (O(n log n) butterfly algorithm)
-    // ========================================================================
-    
-    #[test]
-    fn test_fast_fft_butterfly_basic() {
-        // Test the basic butterfly operation
-        let mut v0 = M31::new(3);
-        let mut v1 = M31::new(5);
-        let twid = M31::new(2);
-        
-        // v0_new = v0 + v1*t = 3 + 5*2 = 13
-        // v1_new = v0 - v1*t = 3 - 5*2 = 3 - 10 = -7 mod p
-        butterfly(&mut v0, &mut v1, twid);
-        
-        assert_eq!(v0, M31::new(13));
-        assert_eq!(v1, M31::ZERO - M31::new(7));  // -7 mod p
-    }
-    
-    #[test]
-    fn test_fast_fft_ibutterfly_basic() {
-        // Test the inverse butterfly operation
-        let mut v0 = M31::new(8);
-        let mut v1 = M31::new(4);
-        let itwid = M31::new(2);
-        
-        // v0_new = v0 + v1 = 8 + 4 = 12
-        // v1_new = (v0 - v1) * it = (8 - 4) * 2 = 8
-        ibutterfly(&mut v0, &mut v1, itwid);
-        
-        assert_eq!(v0, M31::new(12));
-        assert_eq!(v1, M31::new(8));
+    result
+}
+
+/// Circle FRI fold using y-twiddles (first layer).
+///
+/// Folds evaluations from size n to n/2 using the formula:
+/// `f_folded[i] = (f[i] + f[twin_i]) / 2 + beta * (f[i] - f[twin_i]) * twiddle[i] / 2`
+///
+/// # Arguments
+/// * `evals` - Evaluations to fold (must be power of 2)
+/// * `beta` - Random folding challenge
+/// * `y_twiddle_invs` - Precomputed inverse y-twiddles
+///
+/// # Returns
+/// Folded evaluations of half the size
+pub fn fold_y(evals: &[M31], beta: M31, y_twiddle_invs: &[M31]) -> Vec<M31> {
+    let n = evals.len();
+    let half_n = n / 2;
+    assert_eq!(y_twiddle_invs.len(), half_n, "Twiddle count must match half domain size");
+
+    let inv_two = M31::new(2).inv();
+
+    (0..half_n)
+        .map(|i| {
+            let lo = evals[2 * i];       // Even index in interleaved order
+            let hi = evals[2 * i + 1];   // Odd index (twin)
+            let sum = lo + hi;
+            let diff = (lo - hi) * y_twiddle_invs[i];
+            (sum + beta * diff) * inv_two
+        })
+        .collect()
+}
+
+/// Circle FRI fold using x-twiddles (subsequent layers).
+///
+/// After the first y-fold, subsequent folds use x-coordinates.
+/// Formula is the same but with x-twiddles.
+///
+/// # Arguments
+/// * `evals` - Evaluations to fold (must be power of 2)
+/// * `beta` - Random folding challenge
+/// * `x_twiddle_invs` - Precomputed inverse x-twiddles for this layer
+///
+/// # Returns
+/// Folded evaluations of half the size
+pub fn fold_x(evals: &[M31], beta: M31, x_twiddle_invs: &[M31]) -> Vec<M31> {
+    let n = evals.len();
+    let half_n = n / 2;
+    assert_eq!(x_twiddle_invs.len(), half_n, "Twiddle count must match half domain size");
+
+    let inv_two = M31::new(2).inv();
+
+    (0..half_n)
+        .map(|i| {
+            let lo = evals[2 * i];
+            let hi = evals[2 * i + 1];
+            let sum = lo + hi;
+            let diff = (lo - hi) * x_twiddle_invs[i];
+            (sum + beta * diff) * inv_two
+        })
+        .collect()
+}
+
+/// Single-point y-fold for verifier.
+///
+/// Computes the folded value for a single query index.
+pub fn fold_y_single(lo: M31, hi: M31, beta: M31, y_twiddle_inv: M31) -> M31 {
+    let inv_two = M31::new(2).inv();
+    let sum = lo + hi;
+    let diff = (lo - hi) * y_twiddle_inv;
+    (sum + beta * diff) * inv_two
+}
+
+/// Single-point x-fold for verifier.
+///
+/// Computes the folded value for a single query index.
+pub fn fold_x_single(lo: M31, hi: M31, beta: M31, x_twiddle_inv: M31) -> M31 {
+    let inv_two = M31::new(2).inv();
+    let sum = lo + hi;
+    let diff = (lo - hi) * x_twiddle_inv;
+    (sum + beta * diff) * inv_two
+}
+
+/// Get y-twiddle inverse for a single index (for verifier).
+///
+/// More expensive than batch computation but useful for verification.
+/// Uses standard position coset (shift = generator(log_n + 1)) to match batch computation.
+pub fn get_y_twiddle_inv(log_domain_size: usize, index: usize) -> M31 {
+    if log_domain_size == 0 {
+        return M31::ZERO;
     }
-    
-    #[test]
-    fn test_fast_fft_small_sizes() {
-        // Test size 4 (log_size 2) - smallest working size
-        let fft4 = FastCircleFFT::new(2);
-        // For size 4, we provide 2 coefficients (n/2 = 2)
-        let coeffs4 = vec![M31::new(1), M31::new(2)]; // f(x) = 1 + 2x
-        let evals4 = fft4.fft(&coeffs4);
-        assert_eq!(evals4.len(), 4, "FFT should produce 4 evaluations");
-        
-        // Test size 8 (log_size 3)
-        let fft8 = FastCircleFFT::new(3);
-        // For size 8, we provide 4 coefficients (n/2 = 4)
-        let coeffs8 = vec![M31::new(1), M31::new(2), M31::new(3), M31::new(4)];
-        let evals8 = fft8.fft(&coeffs8);
-        assert_eq!(evals8.len(), 8, "FFT should produce 8 evaluations");
+
+    // Standard position coset matching compute_y_twiddle_inverses
+    let shift = CirclePoint::generator(log_domain_size + 1);
+    let subgroup_gen = if log_domain_size >= 2 {
+        CirclePoint::generator(log_domain_size - 1)
+    } else {
+        CirclePoint::IDENTITY
+    };
+
+    // Get the bit-reversed index
+    let max_bits = if log_domain_size > 1 { log_domain_size - 1 } else { 0 };
+    let reversed_idx = if max_bits > 0 {
+        bit_reverse(index, max_bits)
+    } else {
+        0
+    };
+
+    // Compute point at this index in coset0: shift * g^reversed_idx
+    let point = shift.mul(subgroup_gen.pow(reversed_idx as u64));
+
+    if point.y.is_zero() {
+        M31::ZERO
+    } else {
+        point.y.inv()
     }
-    
-    #[test]
-    fn test_fast_fft_roundtrip() {
-        // Test that fft followed by ifft preserves coefficients
-        // CircleFFT: fft takes n/2 coeffs -> n evals, ifft takes n evals -> n/2 coeffs
-        for log_size in 2..=5 {
-            let fast_fft = FastCircleFFT::new(log_size);
-            let n = 1 << log_size;
-            let half = n / 2;
-            
-            // Create test coefficients (only n/2 meaningful for degree < n/2)
-            let coeffs: Vec<M31> = (0..half).map(|i| M31::new((i * 7 + 13) as u32 % 1000)).collect();
-            
-            // Forward FFT: n/2 coeffs -> n evals
-            let evals = fast_fft.fft(&coeffs);
-            assert_eq!(evals.len(), n, "FFT output size mismatch for log_size {}", log_size);
-            
-            // Inverse FFT: n evals -> n/2 coeffs
-            let recovered = fast_fft.ifft(&evals);
-            assert_eq!(recovered.len(), half, "IFFT output size mismatch for log_size {}", log_size);
-            
-            // Check roundtrip for the meaningful coefficients
-            for i in 0..half {
-                assert_eq!(
-                    recovered[i], coeffs[i], 
-                    "Roundtrip failed at index {} for log_size {}: got {:?}, expected {:?}",
-                    i, log_size, recovered[i], coeffs[i]
-                );
-            }
-        }
+}
+
+/// Get x-twiddle inverse for a single index at a given layer (for verifier).
+pub fn get_x_twiddle_inv(log_domain_size: usize, layer: usize, index: usize) -> M31 {
+    let log_layer_size = log_domain_size.saturating_sub(1 + layer);
+    if log_layer_size < 2 {
+        return M31::ONE;
     }
 
-    #[test]
-    fn test_fast_fft_extend() {
-        let fft = FastCircleFFT::new(3);  // size 8
-        
-        // Create coefficients (n/2 = 4 meaningful coefficients)
-        let coeffs: Vec<M31> = (0..4).map(|i| M31::new(i as u32)).collect();
-        let evals = fft.fft(&coeffs);
-        
-        // Extend to size 16
-        let extended = fft.extend(&evals, 1);
-        assert_eq!(extended.len(), 16);
+    let generator = CirclePoint::generator(log_layer_size + 1);
+    let shift = CirclePoint::generator(log_layer_size + 2);
+
+    let reversed_idx = bit_reverse(index, log_layer_size - 1);
+    let point = shift.mul(generator.pow(reversed_idx as u64));
+
+    if point.x.is_zero() {
+        M31::ZERO
+    } else {
+        point.x.inv()
     }
 }
diff --git a/crates/primitives/src/lib.rs b/crates/primitives/src/lib.rs
index a08dce8..2ac191e 100644
--- a/crates/primitives/src/lib.rs
+++ b/crates/primitives/src/lib.rs
@@ -4,9 +4,14 @@
 //! - Mersenne31 (M31) base field arithmetic
 //! - Quartic extension field (QM31) for security-critical operations
 //! - 16-bit limb utilities for 32-bit word decomposition
-//! - Circle group and Circle FFT for M31-native polynomial operations
+//! - Circle group and O(n log n) Circle FFT via Plonky3
 //! - Range-check helpers
 //! - Plonky3 interoperability for SIMD-optimized operations
+//!
+//! # Performance
+//!
+//! Circle FFT operations now use Plonky3's optimized O(n log n) implementation
+//! with SIMD acceleration (NEON on Apple Silicon, AVX2/512 on x86).
 
 pub mod field;
 pub mod extension;
@@ -17,5 +22,16 @@ pub mod p3_interop;
 pub use field::M31;
 pub use extension::{CM31, QM31, U_SQUARED};
 pub use limbs::{to_limbs, from_limbs};
-pub use circle::{CirclePoint, CircleDomain, CircleFFT, Coset, FastCircleFFT};
-pub use p3_interop::{to_p3, from_p3, P3M31};
+pub use circle::{
+    CirclePoint, CircleDomain, CircleFFT, Coset, FastCircleFFT,
+    // Plonky3 re-exports for advanced usage
+    P3CircleDomain, P3CircleEvaluations,
+    // Polynomial utilities
+    evaluate_poly, poly_mul, poly_add, poly_sub, poly_scale, poly_degree,
+    bit_reverse, bit_reverse_permutation, sqrt_m31,
+    // Circle FRI folding utilities
+    compute_y_twiddle_inverses, compute_x_twiddle_inverses, batch_inverse,
+    fold_y, fold_x, fold_y_single, fold_x_single,
+    get_y_twiddle_inv, get_x_twiddle_inv,
+};
+pub use p3_interop::{to_p3, from_p3, to_p3_vec, from_p3_vec, P3M31};
diff --git a/crates/primitives/src/p3_interop.rs b/crates/primitives/src/p3_interop.rs
index 7208609..fca73ee 100644
--- a/crates/primitives/src/p3_interop.rs
+++ b/crates/primitives/src/p3_interop.rs
@@ -12,9 +12,10 @@
 //!
 //! Using these can provide 2-8x speedups for field-heavy operations.
 //!
-//! # DFT Support
-//! 
-//! The `p3_fast_dft` function provides O(n log n) FFT using Plonky3's Radix2Dit.
+//! # Circle STARKs Support
+//!
+//! This module provides interop with Plonky3's Circle STARKs implementation,
+//! enabling O(n log n) Circle FFT operations with SIMD acceleration.
 
 use crate::field::M31;
 use p3_field::{PrimeCharacteristicRing, PrimeField32};
@@ -22,6 +23,9 @@ use p3_field::extension::Complex;
 pub use p3_dft::TwoAdicSubgroupDft;
 pub use p3_mersenne_31::{Mersenne31 as P3M31, Mersenne31ComplexRadix2Dit};
 
+// Re-export Circle STARKs types for direct usage
+pub use p3_circle::{CircleDomain as P3CircleDomain, CircleEvaluations as P3CircleEvaluations};
+
 /// Convert ZP1 M31 to Plonky3 Mersenne31.
 #[inline]
 pub fn to_p3(m: M31) -> P3M31 {
diff --git a/crates/prover/Cargo.toml b/crates/prover/Cargo.toml
index 9582462..55b5a6d 100644
--- a/crates/prover/Cargo.toml
+++ b/crates/prover/Cargo.toml
@@ -22,6 +22,19 @@ serde_json = { workspace = true }
 bytemuck = { workspace = true }
 thiserror = { workspace = true }
 
+# Plonky3 dependencies for optimized FRI and polynomial operations
+p3-mersenne-31 = { workspace = true }
+p3-field = { workspace = true }
+p3-circle = { workspace = true }
+p3-fri = { workspace = true }
+p3-commit = { workspace = true }
+p3-challenger = { workspace = true }
+p3-symmetric = { workspace = true }
+p3-merkle-tree = { workspace = true }
+p3-matrix = { workspace = true }
+p3-dft = { workspace = true }
+p3-util = { workspace = true }
+
 # Optional Metal GPU support (macOS only)
 metal = { version = "0.28", optional = true }
 
diff --git a/crates/prover/src/fri.rs b/crates/prover/src/fri.rs
index 51144e7..41a4a4b 100644
--- a/crates/prover/src/fri.rs
+++ b/crates/prover/src/fri.rs
@@ -9,15 +9,22 @@
 //! 2. Committing to the folded polynomial
 //! 3. Repeating until degree is small enough to send directly
 //!
-//! # Circle FRI Folding
+//! # Circle FRI Folding (Plonky3-compatible)
 //!
-//! For Circle STARKs, folding uses the twin-coset structure:
-//! - Points come in pairs (x, y) and (x, -y)
-//! - f_folded(x) = (f(x,y) + f(x,-y))/2 + α · (f(x,y) - f(x,-y))/(2y)
+//! For Circle STARKs, folding uses proper twiddle factors from the circle domain:
+//! - **First layer (y-fold)**: Uses inverse y-coordinates as twiddles
+//!   `f_folded[i] = (lo + hi) / 2 + β * (lo - hi) * y_inv[i] / 2`
+//! - **Subsequent layers (x-fold)**: Uses inverse x-coordinates as twiddles
+//!   `f_folded[i] = (lo + hi) / 2 + β * (lo - hi) * x_inv[i] / 2`
 //!
-//! This halves both the domain size and polynomial degree.
-
-use zp1_primitives::M31;
+//! This matches Plonky3's Circle FRI implementation for interoperability and
+//! uses batch inversion for O(n) twiddle computation instead of O(n) inversions.
+
+use zp1_primitives::{
+    M31,
+    compute_y_twiddle_inverses, compute_x_twiddle_inverses,
+    fold_y, fold_x,
+};
 use crate::channel::ProverChannel;
 use crate::commitment::MerkleTree;
 
@@ -155,18 +162,48 @@ pub struct FriLayerQueryProof {
 }
 
 /// FRI prover implementing the commit and query phases.
+///
+/// Uses Plonky3-compatible Circle FRI folding with precomputed twiddles
+/// for optimal performance (batch inversion instead of per-element inversion).
 pub struct FriProver {
     config: FriConfig,
+    /// Precomputed inverse y-twiddles for first fold layer.
+    y_twiddle_invs: Vec<M31>,
+    /// Precomputed inverse x-twiddles for subsequent fold layers.
+    x_twiddle_invs: Vec<Vec<M31>>,
 }
 
 impl FriProver {
     /// Create a new FRI prover with the given configuration.
+    ///
+    /// Precomputes all twiddle factors using batch inversion for efficiency.
     pub fn new(config: FriConfig) -> Self {
-        Self { config }
+        // Precompute y-twiddles for the first fold
+        let y_twiddle_invs = if config.log_domain_size >= 1 {
+            compute_y_twiddle_inverses(config.log_domain_size)
+        } else {
+            vec![]
+        };
+
+        // Precompute x-twiddles for subsequent folds
+        let num_x_layers = config.num_layers().saturating_sub(1);
+        let x_twiddle_invs: Vec<Vec<M31>> = (0..num_x_layers)
+            .map(|layer| compute_x_twiddle_inverses(config.log_domain_size, layer))
+            .collect();
+
+        Self {
+            config,
+            y_twiddle_invs,
+            x_twiddle_invs,
+        }
     }
 
     /// Commit phase: fold the polynomial repeatedly and commit to each layer.
     ///
+    /// Uses Plonky3-compatible Circle FRI folding:
+    /// - First layer uses y-fold with inverse y-twiddles
+    /// - Subsequent layers use x-fold with inverse x-twiddles
+    ///
     /// # Arguments
     /// * `evaluations` - Initial polynomial evaluations on the LDE domain
     /// * `channel` - Fiat-Shamir channel for challenges
@@ -182,7 +219,7 @@ impl FriProver {
             evaluations.len() == 1 << self.config.log_domain_size,
             "Evaluations must match domain size"
         );
-        
+
         let mut layers = Vec::with_capacity(self.config.num_layers());
         let mut current_evals = evaluations;
 
@@ -194,10 +231,22 @@ impl FriProver {
             layers.push(layer);
 
             // Get folding challenge from verifier (Fiat-Shamir)
-            let alpha = channel.squeeze_challenge();
+            let beta = channel.squeeze_challenge();
 
-            // Fold the polynomial
-            current_evals = self.fold_circle(&current_evals, alpha, layer_idx);
+            // Fold the polynomial using proper Circle FRI
+            current_evals = if layer_idx == 0 {
+                // First layer: y-fold
+                fold_y(&current_evals, beta, &self.y_twiddle_invs)
+            } else {
+                // Subsequent layers: x-fold
+                let x_layer = layer_idx - 1;
+                if x_layer < self.x_twiddle_invs.len() {
+                    fold_x(&current_evals, beta, &self.x_twiddle_invs[x_layer])
+                } else {
+                    // Fallback for very small domains
+                    self.fold_simple(&current_evals, beta)
+                }
+            };
         }
 
         // Final polynomial is small enough to send directly
@@ -219,58 +268,36 @@ impl FriProver {
         (layers, proof)
     }
 
-    /// Circle FRI folding: fold polynomial using twin-coset structure.
-    ///
-    /// For points at indices i and i + n/2 (which are twins on the circle),
-    /// we compute:
-    /// - f_folded[i] = (f[i] + f[i + n/2]) / 2 + alpha * (f[i] - f[i + n/2]) / (2 * y_i)
-    ///
-    /// This halves the domain size while maintaining the RS proximity property.
+    /// Simple folding for very small domains (fallback).
+    fn fold_simple(&self, evals: &[M31], beta: M31) -> Vec<M31> {
+        let half_n = evals.len() / 2;
+        let inv_two = M31::new(2).inv();
+
+        (0..half_n)
+            .map(|i| {
+                let lo = evals[2 * i];
+                let hi = evals[2 * i + 1];
+                let sum = lo + hi;
+                let diff = lo - hi;
+                (sum + beta * diff) * inv_two
+            })
+            .collect()
+    }
+
+    /// Legacy fold_circle method for backwards compatibility.
+    /// Uses the new optimized folding internally.
+    #[allow(dead_code)]
     fn fold_circle(&self, evals: &[M31], alpha: M31, layer: usize) -> Vec<M31> {
-        use zp1_primitives::CirclePoint;
-        
-        let n = evals.len();
-        let half_n = n / 2;
-        let mut folded = Vec::with_capacity(half_n);
-
-        // Two inverse constant
-        let two = M31::new(2);
-        let inv_two = two.inv();
-
-        // Get the circle domain generator for current layer
-        // Domain size at this layer = 2^(log_domain_size - layer)
-        let layer_log_size = self.config.log_domain_size.saturating_sub(layer);
-        let generator = CirclePoint::generator(layer_log_size);
-
-        for i in 0..half_n {
-            let f_pos = evals[i];
-            let f_neg = evals[i + half_n];
-
-            // Sum and difference
-            let sum = f_pos + f_neg;
-            let diff = f_pos - f_neg;
-
-            // Get y-coordinate of domain point i
-            // point_i = generator^i
-            let point_i = generator.pow(i as u64);
-            let y_i = point_i.y;
-            
-            // Proper Circle FRI folding formula:
-            // f_folded = (sum / 2) + alpha * (diff / (2 * y_i))
-            // = (sum / 2) + alpha * diff * inv_two * y_i^(-1)
-            let folded_val = if y_i.is_zero() {
-                // Edge case: y = 0 means we're at (1,0) or (-1,0)
-                // Just use the sum part
-                sum * inv_two
+        if layer == 0 {
+            fold_y(evals, alpha, &self.y_twiddle_invs)
+        } else {
+            let x_layer = layer - 1;
+            if x_layer < self.x_twiddle_invs.len() {
+                fold_x(evals, alpha, &self.x_twiddle_invs[x_layer])
             } else {
-                let y_inv = y_i.inv();
-                sum * inv_two + alpha * diff * inv_two * y_inv
-            };
-            
-            folded.push(folded_val);
+                self.fold_simple(evals, alpha)
+            }
         }
-
-        folded
     }
 
     /// Generate query proofs for all requested positions.
@@ -319,37 +346,44 @@ impl FriProver {
     }
     
     /// Verify a FRI proof (used by the verifier).
+    ///
+    /// Uses Plonky3-compatible Circle FRI folding verification:
+    /// - First layer uses y-fold with inverse y-twiddles
+    /// - Subsequent layers use x-fold with inverse x-twiddles
     pub fn verify(
         &self,
         proof: &FriProof,
         initial_commitment: &[u8; 32],
         channel: &mut ProverChannel,
     ) -> bool {
+        use zp1_primitives::{fold_y_single, fold_x_single, get_y_twiddle_inv, get_x_twiddle_inv};
+
         // Absorb initial commitment
         channel.absorb_commitment(initial_commitment);
-        
+
         // Collect challenges
         let mut challenges = Vec::with_capacity(proof.layer_commitments.len());
         for commitment in &proof.layer_commitments {
             channel.absorb_commitment(commitment);
             challenges.push(channel.squeeze_challenge());
         }
-        
+
         // Verify each query
         let query_indices = channel.squeeze_query_indices(
             self.config.num_queries,
             1 << self.config.log_domain_size,
         );
-        
+
         for (query_idx, query_proof) in proof.query_proofs.iter().enumerate() {
             if query_proof.index != query_indices[query_idx] {
                 return false;
             }
-            
+
             // Verify folding consistency through layers
             let mut current_idx = query_proof.index;
             let mut expected_value = None;
-            
+            let mut current_log_size = self.config.log_domain_size;
+
             for (layer_idx, layer_proof) in query_proof.layer_proofs.iter().enumerate() {
                 // If we have an expected value from previous folding, verify it
                 if let Some(expected) = expected_value {
@@ -357,21 +391,33 @@ impl FriProver {
                         return false;
                     }
                 }
-                
+
                 // Verify Merkle proof
                 // (In full implementation, would verify against layer commitment)
-                
-                // Compute expected folded value for next layer
-                let alpha = challenges[layer_idx];
-                let inv_two = M31::new(2).inv();
-                let sum = layer_proof.value + layer_proof.sibling_value;
-                let diff = layer_proof.value - layer_proof.sibling_value;
-                let folded = sum * inv_two + alpha * diff * inv_two;
-                
+
+                // Compute expected folded value for next layer using proper Circle FRI
+                let beta = challenges[layer_idx];
+                let lo = layer_proof.value;
+                let hi = layer_proof.sibling_value;
+
+                let folded = if layer_idx == 0 {
+                    // First layer: y-fold
+                    let twiddle_idx = current_idx / 2; // Index into y-twiddles
+                    let y_inv = get_y_twiddle_inv(current_log_size, twiddle_idx);
+                    fold_y_single(lo, hi, beta, y_inv)
+                } else {
+                    // Subsequent layers: x-fold
+                    let x_layer = layer_idx - 1;
+                    let twiddle_idx = current_idx / 2;
+                    let x_inv = get_x_twiddle_inv(current_log_size, x_layer, twiddle_idx);
+                    fold_x_single(lo, hi, beta, x_inv)
+                };
+
                 expected_value = Some(folded);
                 current_idx /= 2;
+                current_log_size = current_log_size.saturating_sub(1);
             }
-            
+
             // Verify final value matches final polynomial evaluation
             if let Some(expected) = expected_value {
                 let final_eval = evaluate_poly_at(&proof.final_poly, current_idx);
@@ -380,7 +426,7 @@ impl FriProver {
                 }
             }
         }
-        
+
         true
     }
 }
diff --git a/crates/prover/src/lde.rs b/crates/prover/src/lde.rs
index c1249ec..c26c422 100644
--- a/crates/prover/src/lde.rs
+++ b/crates/prover/src/lde.rs
@@ -1,27 +1,39 @@
-//! Low-degree extension (LDE) using Circle FFT.
+//! Low-degree extension (LDE) using Plonky3's Circle FFT.
 //!
 //! For Mersenne31, we use Circle STARKs - evaluation on the circle group
 //! { (x, y) : x^2 + y^2 = 1 } over M31, which provides FFT-friendly domains.
 //!
-//! The LDE process:
-//! 1. Interpolate trace values to get polynomial coefficients (iFFT)
-//! 2. Extend coefficients to larger domain
-//! 3. Evaluate on extended domain (FFT)
+//! # Plonky3 Integration
+//!
+//! This module now uses Plonky3's optimized O(n log n) Circle FFT with:
+//! - SIMD acceleration (NEON on Apple Silicon, AVX2/512 on x86)
+//! - Direct extrapolation for LDE (no separate iFFT + FFT)
+//! - Parallel processing via Rayon
+//!
+//! # LDE Process (Optimized)
+//!
+//! The LDE now uses Plonky3's `extrapolate` which combines:
+//! 1. Interpolation on trace domain
+//! 2. Evaluation on extended domain
+//! Into a single optimized operation.
 
 use zp1_primitives::{M31, CircleFFT, CirclePoint};
+use rayon::prelude::*;
 
-/// LDE domain configuration.
+/// LDE domain configuration with Plonky3 acceleration.
 #[derive(Clone, Debug)]
 pub struct LdeDomain {
     /// Log2 of the trace length.
     pub log_trace_len: usize,
     /// Blowup factor (typically 8 or 16).
     pub blowup: usize,
+    /// Log2 of the blowup factor.
+    log_blowup: usize,
     /// Log2 of the LDE domain size.
     pub log_domain_size: usize,
-    /// Circle FFT for trace domain.
+    /// Circle FFT for trace domain (handles both FFT and extrapolation).
     trace_fft: CircleFFT,
-    /// Circle FFT for extended domain.
+    /// Circle FFT for extended domain (for domain point access).
     extended_fft: CircleFFT,
 }
 
@@ -30,7 +42,7 @@ impl LdeDomain {
     pub fn new(trace_len: usize, blowup: usize) -> Self {
         assert!(trace_len.is_power_of_two(), "Trace length must be power of 2");
         assert!(blowup.is_power_of_two(), "Blowup must be power of 2");
-        
+
         let log_trace_len = trace_len.trailing_zeros() as usize;
         let log_blowup = blowup.trailing_zeros() as usize;
         let log_domain_size = log_trace_len + log_blowup;
@@ -38,6 +50,7 @@ impl LdeDomain {
         Self {
             log_trace_len,
             blowup,
+            log_blowup,
             log_domain_size,
             trace_fft: CircleFFT::new(log_trace_len),
             extended_fft: CircleFFT::new(log_domain_size),
@@ -68,20 +81,22 @@ impl LdeDomain {
         self.extended_fft.get_domain_point(i)
     }
 
-    /// Perform LDE on a single column using Circle FFT.
+    /// Perform LDE on a single column using Plonky3's extrapolate.
+    ///
+    /// This uses Plonky3's optimized extrapolation which combines
+    /// interpolation and evaluation into a single O(n log n) operation.
     pub fn extend(&self, values: &[M31]) -> Vec<M31> {
         assert_eq!(values.len(), self.trace_len(), "Input must match trace length");
-        
-        // Step 1: iFFT to get coefficients
-        let coeffs = self.trace_fft.ifft(values);
-        
-        // Step 2: FFT on extended domain (zero-padded coefficients)
-        self.extended_fft.fft(&coeffs)
+
+        // Use Plonky3's extrapolate for efficient LDE
+        self.trace_fft.extend(values, self.log_blowup)
     }
 
-    /// Perform LDE on multiple columns.
+    /// Perform LDE on multiple columns in parallel.
+    ///
+    /// Uses Rayon for parallel processing across columns.
     pub fn extend_columns(&self, columns: &[Vec<M31>]) -> Vec<Vec<M31>> {
-        columns.iter().map(|col| self.extend(col)).collect()
+        columns.par_iter().map(|col| self.extend(col)).collect()
     }
 
     /// Get trace FFT.
diff --git a/crates/verifier/src/verify.rs b/crates/verifier/src/verify.rs
index 4c0009a..2c24c49 100644
--- a/crates/verifier/src/verify.rs
+++ b/crates/verifier/src/verify.rs
@@ -543,6 +543,10 @@ impl Verifier {
     }
 
     /// Verify a single FRI query through all layers.
+    ///
+    /// Uses Plonky3-compatible Circle FRI folding:
+    /// - First layer: y-fold with inverse y-twiddles
+    /// - Subsequent layers: x-fold with inverse x-twiddles
     fn verify_fri_query(
         &self,
         fri_proof: &FriProof,
@@ -563,6 +567,8 @@ impl Verifier {
 
         let mut current_index = query.index;
         let mut expected_next: Option<M31> = None;
+        // Track current domain size for twiddle computation
+        let mut current_log_size = self.config.log_lde_domain_size();
 
         for (layer_idx, layer_proof) in query.layer_proofs.iter().enumerate() {
             let commitment = &fri_proof.layer_commitments[layer_idx];
@@ -589,13 +595,27 @@ impl Verifier {
                 }
             }
 
-            // Compute folded value for next layer (twin point folding)
-            let alpha = alphas.get(layer_idx).copied().ok_or_else(|| VerifyError::FriStructure {
+            // Compute folded value for next layer using proper Circle FRI
+            let beta = alphas.get(layer_idx).copied().ok_or_else(|| VerifyError::FriStructure {
                 reason: "Missing FRI alpha".into(),
             })?;
-            let folded = fri_utils::compute_fold(layer_proof.value, layer_proof.sibling_value, alpha);
+
+            let lo = layer_proof.value;
+            let hi = layer_proof.sibling_value;
+            let twiddle_idx = current_index / 2;
+
+            let folded = if layer_idx == 0 {
+                // First layer: y-fold
+                fri_utils::compute_fold_y(lo, hi, beta, current_log_size, twiddle_idx)
+            } else {
+                // Subsequent layers: x-fold
+                let x_layer = layer_idx - 1;
+                fri_utils::compute_fold_x(lo, hi, beta, current_log_size, x_layer, twiddle_idx)
+            };
+
             expected_next = Some(folded);
             current_index /= 2;
+            current_log_size = current_log_size.saturating_sub(1);
         }
 
         // Final polynomial check
@@ -621,17 +641,52 @@ impl Verifier {
     }
 }
 
-/// FRI verification helper functions.
+/// FRI verification helper functions with Plonky3-compatible Circle FRI folding.
 pub mod fri_utils {
-    use zp1_primitives::M31;
+    use zp1_primitives::{M31, fold_y_single, fold_x_single, get_y_twiddle_inv, get_x_twiddle_inv};
 
-    /// Compute FRI fold: f'(x^2) = f_even + alpha * f_odd
+    /// Compute Circle FRI fold for the first layer (y-fold).
+    ///
+    /// Uses proper y-twiddles from the circle domain for soundness.
+    ///
+    /// # Arguments
+    /// * `lo` - Value at even index
+    /// * `hi` - Value at odd index (twin)
+    /// * `beta` - Folding challenge
+    /// * `log_domain_size` - Log2 of current domain size
+    /// * `twiddle_idx` - Index into the twiddle array
+    pub fn compute_fold_y(lo: M31, hi: M31, beta: M31, log_domain_size: usize, twiddle_idx: usize) -> M31 {
+        let y_inv = get_y_twiddle_inv(log_domain_size, twiddle_idx);
+        fold_y_single(lo, hi, beta, y_inv)
+    }
+
+    /// Compute Circle FRI fold for subsequent layers (x-fold).
+    ///
+    /// Uses proper x-twiddles from the circle domain for soundness.
+    ///
+    /// # Arguments
+    /// * `lo` - Value at even index
+    /// * `hi` - Value at odd index (twin)
+    /// * `beta` - Folding challenge
+    /// * `log_domain_size` - Log2 of current domain size
+    /// * `x_layer` - Which x-folding layer (0 = first x-fold after y-fold)
+    /// * `twiddle_idx` - Index into the twiddle array
+    pub fn compute_fold_x(lo: M31, hi: M31, beta: M31, log_domain_size: usize, x_layer: usize, twiddle_idx: usize) -> M31 {
+        let x_inv = get_x_twiddle_inv(log_domain_size, x_layer, twiddle_idx);
+        fold_x_single(lo, hi, beta, x_inv)
+    }
+
+    /// Legacy compute_fold for backward compatibility.
+    ///
+    /// NOTE: This does NOT use proper twiddles and should be avoided for soundness.
+    /// Use `compute_fold_y` or `compute_fold_x` instead.
+    #[deprecated(note = "Use compute_fold_y or compute_fold_x for proper Circle FRI")]
     pub fn compute_fold(even: M31, odd: M31, alpha: M31) -> M31 {
-        // For twin-point folding on a circle: 0.5 * [(even + odd) + alpha * (even - odd)]
+        // Simple folding without twiddles (NOT sound for Circle FRI)
         let inv_two = M31::new(2).inv();
         let sum = even + odd;
         let diff = even - odd;
-        (sum * inv_two) + alpha * diff * inv_two
+        (sum + alpha * diff) * inv_two
     }
 
     /// Evaluate polynomial at a point using Horner's method.
@@ -639,7 +694,7 @@ pub mod fri_utils {
         if coeffs.is_empty() {
             return M31::ZERO;
         }
-        
+
         let mut result = coeffs[coeffs.len() - 1];
         for i in (0..coeffs.len() - 1).rev() {
             result = result * x + coeffs[i];
@@ -689,14 +744,31 @@ mod tests {
     }
 
     #[test]
-    fn test_fri_fold() {
-        let even = M31::new(10);
-        let odd = M31::new(20);
-        let alpha = M31::new(3);
-        
-        let folded = fri_utils::compute_fold(even, odd, alpha);
-        // (10+20)/2 + 3*(10-20)/2 = 15 + 3*(-10)/2 = 15 - 15 = 0
-        assert_eq!(folded.as_u32(), 0);
+    fn test_fri_fold_y() {
+        // Test Circle FRI y-fold
+        let lo = M31::new(10);
+        let hi = M31::new(20);
+        let beta = M31::new(3);
+
+        // Test y-fold at index 0 with domain size 2^4 = 16
+        let folded = fri_utils::compute_fold_y(lo, hi, beta, 4, 0);
+        // The result depends on the y-twiddle at index 0
+        // Just verify it produces a valid field element
+        assert!(folded.as_u32() < M31::P);
+    }
+
+    #[test]
+    fn test_fri_fold_x() {
+        // Test Circle FRI x-fold
+        let lo = M31::new(10);
+        let hi = M31::new(20);
+        let beta = M31::new(3);
+
+        // Test x-fold at layer 0, index 0 with domain size 2^4 = 16
+        let folded = fri_utils::compute_fold_x(lo, hi, beta, 4, 0, 0);
+        // The result depends on the x-twiddle
+        // Just verify it produces a valid field element
+        assert!(folded.as_u32() < M31::P);
     }
 
     #[test]