diff --git a/docs/index.rst b/docs/index.rst
index 481e633..745bd02 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -51,6 +51,7 @@ directions, and invariant along the vertical direction.
    installation
    usage
    convention   
+   optimization_tasks
    contributing
    authors
    history
diff --git a/docs/optimization_tasks.rst b/docs/optimization_tasks.rst
new file mode 100644
index 0000000..d6a5f3b
--- /dev/null
+++ b/docs/optimization_tasks.rst
@@ -0,0 +1,256 @@
+Optimization task definitions (1D mode + S-matrix caching)
+===========================================================
+
+This note turns the recent performance discussion into implementation
+tasks with explicit scope, physics constraints, and compatibility rules.
+
+
+Task A: Native 1D mode inferred from ``Nx``/``Ny``
+--------------------------------------------------
+
+Goal
+~~~~
+
+Accelerate simulations of structures that are periodic in only one lateral
+direction (e.g., binary gratings), while keeping the user-facing API
+compatible with existing code.
+
+Current issue
+~~~~~~~~~~~~~
+
+The current solver stack is built around 2D unit-cell discretization and a 2D
+harmonic index set ``G[:, 0], G[:, 1]``. This causes avoidable overhead when a
+layer is effectively 1D (for example ``Ny == 1``), because the code still
+executes a full 2D path for Fourier transforms, convolution indexing, and
+matrix assembly.
+
+Why this is necessary for performance
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In 1D gratings, the physically relevant Fourier basis is one index (``m``)
+instead of two indices (``m, n``). Keeping a 2D representation for a 1D
+problem inflates matrix dimensions and therefore CPU time and memory for
+eigensolves and S-matrix operations.
+
+Proposed optimization
+~~~~~~~~~~~~~~~~~~~~~
+
+Infer 1D mode directly from grid shape, without introducing a new required
+flag:
+
+* If ``Nx > 1`` and ``Ny == 1``: use x-periodic 1D path.
+* If ``Ny > 1`` and ``Nx == 1``: use y-periodic 1D path.
+* If ``Nx > 1`` and ``Ny > 1``: keep existing 2D path.
+* If ``Nx == 1`` and ``Ny == 1``: treat as degenerate/uniform and use current
+  behavior or raise a clear error.
+
+Important architectural rule:
+
+* Do **not** overload ``Gmethod`` to represent dimensionality. ``Gmethod``
+  should keep its current meaning (2D truncation strategy). 1D dimensional
+  inference should be orthogonal and handled before harmonic selection.
+
+Implementation outline
+~~~~~~~~~~~~~~~~~~~~~~
+
+1. Add an internal per-layer dimensional descriptor (e.g., ``periodic_dim``)
+   derived from ``Nx, Ny`` in ``Add_LayerGrid``.
+2. Introduce 1D FFT/convolution helpers for patterned layers with one singleton
+   axis.
+3. Add a 1D harmonic generator path that produces a single integer harmonic
+   list and map it to the existing matrix assembly interface with minimal
+   branching.
+4. Keep existing 2D APIs and default behavior unchanged for ``Nx > 1`` and
+   ``Ny > 1``.
+
+Backward compatibility and risks
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+* Existing 2D scripts remain unchanged.
+* Existing 1D-as-2D scripts (``Ny == 1`` or ``Nx == 1``) should produce
+  numerically consistent results, with lower runtime and memory use.
+* Risk: hidden assumptions that both harmonic axes exist in downstream code.
+  Mitigation: isolate 1D branching at Fourier/convolution and harmonic-index
+  construction boundaries; keep the rest of the solver working on consistent
+  shaped arrays.
+
+Physical validity constraints
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+* Valid only when one in-plane direction is truly invariant.
+* Must preserve Maxwell boundary conditions and the same sign/branch
+  conventions used in the current eigensystem solver.
+* Polarization coupling behavior in the 1D path must remain consistent with
+  the existing field conventions.
+
+Acceptance criteria
+~~~~~~~~~~~~~~~~~~~
+
+* Unit tests showing that 1D inferred path (``Ny == 1`` or ``Nx == 1``)
+  matches the legacy 2D emulation within tolerance.
+* Benchmarks demonstrating lower runtime/memory for representative 1D grating
+  problems.
+
+
+Task B: Cache matrix inverses/factorizations in repeated S-matrix use
+----------------------------------------------------------------------
+
+Goal
+~~~~
+
+Reduce repeated linear-algebra cost in S-matrix assembly during consecutive
+solves where layer eigensystems are unchanged.
+
+Current issue
+~~~~~~~~~~~~~
+
+``GetSMatrix`` repeatedly computes expensive inverses inside layer loops,
+including ``inv(phi_l)`` and ``inv(kp_l @ phi_l)``. For repeated solves on the
+same solved state, these costs are paid again and again.
+
+Why this is necessary for performance
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Matrix inversion/factorization dominates runtime for moderate/large harmonic
+bases. Reusing already computed inverse/factorization objects can substantially
+reduce per-solve overhead in repeated post-processing workflows.
+
+Proposed optimization
+~~~~~~~~~~~~~~~~~~~~~
+
+Add cacheable per-layer linear algebra artifacts:
+
+* ``phi_inv`` (or factorization equivalent) for each solved layer.
+* factorization/inverse of ``kp @ phi`` used in ``GetSMatrix`` transitions.
+
+Use cached artifacts when valid; recompute on invalidation.
+
+Strict invalidation rules (must-have)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Cache is valid only if all dependencies are unchanged:
+
+* frequency ``omega``
+* incident-angle-dependent wavevectors ``kx, ky``
+* reciprocal lattice / harmonic set ``G``
+* patterned-layer Fourier material matrices (from epsilon grids)
+* layer thickness-dependent phase factors where applicable
+
+Any change to these must invalidate affected layer caches before S-matrix
+assembly.
+
+Backward compatibility and risks
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+* Public API should remain unchanged; caching is internal.
+* Results must be numerically identical to non-cached computation (within
+  floating-point tolerance).
+* Main risk is stale-cache reuse. Mitigation: conservative invalidation keyed
+  to the dependency set above.
+
+Physical validity constraints
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The eigenvectors ``phi`` are **not** universally invariant; they vary when the
+operator changes (e.g., through ``omega``, ``kx``, ``ky``, or epsilon Fourier
+matrices). Therefore caching is physically valid only with strict dependency
+tracking and invalidation.
+
+Acceptance criteria
+~~~~~~~~~~~~~~~~~~~
+
+* Regression tests proving cached and non-cached S-matrix paths agree.
+* Tests that modify one dependency at a time and verify cache invalidation.
+* Performance checks showing reduced repeated-solve wall time for unchanged
+  layer states.
+
+
+Task C: Incremental implementation plan for 1D support
+------------------------------------------------------
+
+Goal
+~~~~
+
+Define a practical, low-risk implementation sequence to deliver inferred 1D
+support quickly while preserving numerical correctness and backward
+compatibility.
+
+Why this task is separate from Task A
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Task A defines the target architecture. This task defines *how* to implement it
+in shippable stages, so performance gains are achieved early and regressions are
+contained.
+
+Proposed phased plan
+~~~~~~~~~~~~~~~~~~~~
+
+Phase 1 (detection + plumbing)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+* Add internal detection logic from ``Nx, Ny``:
+
+  - ``Nx > 1, Ny == 1`` -> ``periodic_dim='x'``
+  - ``Ny > 1, Nx == 1`` -> ``periodic_dim='y'``
+  - otherwise -> existing 2D path
+
+* Store per-layer dimension metadata and surface it only internally (no public
+  API change).
+* Add explicit validation/error messages for ambiguous/degenerate grids.
+
+Phase 2 (1D Fourier path)
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+* Implement 1D FFT/convolution helpers and dispatch to them for inferred 1D
+  layers.
+* Keep existing 2D helpers untouched for non-1D layers.
+* Ensure returned tensor/matrix shapes remain compatible with downstream
+  eigensystem code to minimize refactoring risk.
+
+Phase 3 (1D harmonic basis path)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+* Add a dedicated 1D harmonic generator for a single index basis.
+* Keep ``Gmethod`` semantics unchanged for 2D truncation only.
+* Provide internal adapters so field reconstruction and post-processing APIs
+  keep working with both 1D and 2D solved states.
+
+Phase 4 (verification + docs + benchmark)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+* Add regression tests comparing inferred 1D path against current 1D-as-2D
+  emulation on representative gratings.
+* Add benchmark cases demonstrating runtime and memory improvements.
+* Document the inference rules and edge-case behavior in user docs.
+
+Performance expectations
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+* Early wins in Phase 2 from avoiding unnecessary 2D FFT/convolution work for
+  singleton-axis grids.
+* Largest gains in Phase 3 once harmonic basis dimensionality is reduced from
+  2D to 1D for truly 1D periodic structures.
+
+Backward compatibility
+~~~~~~~~~~~~~~~~~~~~~~
+
+* No required new flags; existing scripts should continue to run unchanged.
+* Users currently setting ``Ny == 1`` or ``Nx == 1`` receive the optimized path
+  automatically.
+* 2D behavior remains the default whenever both dimensions exceed 1.
+
+Physical validity constraints
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+* Use 1D path only when one in-plane direction is invariant.
+* Ensure matching boundary-condition treatment and branch/sign conventions
+  relative to existing 2D solver.
+* Include tests for both normal and oblique incidence to verify consistency of
+  field/polarization behavior.
+
+Acceptance criteria
+~~~~~~~~~~~~~~~~~~~
+
+* Stage-gated PRs where each phase is independently testable and reversible.
+* Numerical agreement with baseline 1D-as-2D results within tolerance.
+* Demonstrated performance gains on at least one binary-grating benchmark.
diff --git a/grcwa/rcwa.py b/grcwa/rcwa.py
index 5ebc034..cd2a6de 100644
--- a/grcwa/rcwa.py
+++ b/grcwa/rcwa.py
@@ -24,40 +24,46 @@ def __init__(self,nG,L1,L2,freq,theta,phi,verbose=1):
         self.nG = nG
         self.verbose = verbose
         self.Layer_N = 0  # total number of layers
-      
+
         # the length of the following variables = number of total layers
         self.thickness_list = []
         self.id_list = []  #[type, No., No. in patterned/uniform, No. in its family] starting from 0
-        # type:0 for uniform, 1 for Grids, 2 for Fourier
+        ## type:0 for uniform, 1 for Grids, 2 for Fourier
 
-        self.kp_list = []                
+        self.kp_list = []
         self.q_list = []  # eigenvalues
         self.phi_list = [] #eigenvectors
 
         # Uniform layer
         self.Uniform_ep_list = []
         self.Uniform_N = 0
-        
+
         # Patterned layer
-        self.Patterned_N = 0  # total number of patterned layers        
+        self.Patterned_N = 0  # total number of patterned layers
         self.Patterned_epinv_list = []
         self.Patterned_ep2_list = []
-        
+
         # patterned layer from Grids
         self.GridLayer_N = 0
         self.GridLayer_Nxy_list = []
 
         # layers of analytic Fourier series (e.g. circles)
         self.FourierLayer_N = 0
-        self.FourierLayer_params = []        
-        
+        self.FourierLayer_params = []
+
+        # cache for repeated S-matrix assembly
+        self._smatrix_cache = None
+
+        # inferred periodic dimension based on grid discretization
+        self.grid_periodic_dim = '2d'
+
     def Add_LayerUniform(self,thickness,epsilon):
         #assert type(thickness) == float, 'thickness should be a float'
 
         self.id_list.append([0,self.Layer_N,self.Uniform_N])
         self.Uniform_ep_list.append(epsilon)
         self.thickness_list.append(thickness)
-        
+
         self.Layer_N += 1
         self.Uniform_N += 1
 
@@ -79,6 +85,24 @@ def Add_LayerFourier(self,thickness,params):
         self.Patterned_N += 1
         self.FourierLayer_N += 1
 
+    def _infer_grid_periodic_dim(self):
+        # Infer effective periodicity from grid layer discretization.
+        # Returns: '2d', '1dx', or '1dy'.
+        if self.GridLayer_N == 0:
+            return '2d'
+
+        all_ny1 = True
+        all_nx1 = True
+        for Nx, Ny in self.GridLayer_Nxy_list:
+            all_ny1 = all_ny1 and (Ny == 1)
+            all_nx1 = all_nx1 and (Nx == 1)
+
+        if all_ny1 and not all_nx1:
+            return '1dx'
+        if all_nx1 and not all_ny1:
+            return '1dy'
+        return '2d'
+
     def Init_Setup(self,Pscale=1.,Gmethod=0):
         '''
         Set up reciprocal lattice (Gmethod:truncation scheme, 0 for circular, 1 for rectangular)
@@ -92,28 +116,39 @@ def Init_Setup(self,Pscale=1.,Gmethod=0):
         # set up reciprocal lattice
         self.Lk1, self.Lk2 = Lattice_Reciprocate(self.L1,self.L2)
         self.G,self.nG = Lattice_getG(self.nG,self.Lk1,self.Lk2,method=Gmethod)
-        
+
+        # infer effective 1D periodicity from grid discretization when possible
+        self.grid_periodic_dim = self._infer_grid_periodic_dim()
+        if self.grid_periodic_dim == '1dx':
+            ind = self.G[:,1] == 0
+            self.G = self.G[ind]
+            self.nG = len(self.G)
+        elif self.grid_periodic_dim == '1dy':
+            ind = self.G[:,0] == 0
+            self.G = self.G[ind]
+            self.nG = len(self.G)
+
         self.Lk1 = self.Lk1/Pscale
         self.Lk2 = self.Lk2/Pscale
         # self.kx = kx0 + 2*bd.pi*(self.Lk1[0]*self.G[:,0]+self.Lk2[0]*self.G[:,1])
         # self.ky = ky0 + 2*bd.pi*(self.Lk1[1]*self.G[:,0]+self.Lk2[1]*self.G[:,1])
         self.kx,self.ky = Lattice_SetKs(self.G, kx0, ky0, self.Lk1, self.Lk2)
-        
+
         #normalization factor for energies off normal incidence
         self.normalization = bd.sqrt(self.Uniform_ep_list[0])/bd.cos(self.theta)
-        
+
         #if comm.rank == 0 and verbose>0:
         if self.verbose>0:
             print('Total nG = ',self.nG)
 
         self.Patterned_ep2_list = [None]*self.Patterned_N
-        self.Patterned_epinv_list = [None]*self.Patterned_N            
+        self.Patterned_epinv_list = [None]*self.Patterned_N
         for i in range(self.Layer_N):
             if self.id_list[i][0] == 0:
                 ep = self.Uniform_ep_list[self.id_list[i][2]]
                 kp = MakeKPMatrix(self.omega,0,1./ep,self.kx,self.ky)
                 self.kp_list.append(kp)
-                
+
                 q,phi = SolveLayerEigensystem_uniform(self.omega,self.kx,self.ky,ep)
                 self.q_list.append(q)
                 self.phi_list.append(phi)
@@ -121,7 +156,36 @@ def Init_Setup(self,Pscale=1.,Gmethod=0):
                 self.kp_list.append(None)
                 self.q_list.append(None)
                 self.phi_list.append(None)
-                
+
+        self._invalidate_smatrix_cache()
+
+    def _invalidate_smatrix_cache(self):
+        self._smatrix_cache = None
+
+    def _get_smatrix_cache(self):
+        if self._smatrix_cache is not None:
+            return self._smatrix_cache
+
+        if len(self.phi_list) != self.Layer_N or len(self.kp_list) != self.Layer_N:
+            return None
+
+        phi_inv_list = [None] * self.Layer_N
+        kpphi_inv_list = [None] * self.Layer_N
+
+        for i in range(self.Layer_N):
+            phi = self.phi_list[i]
+            kp = self.kp_list[i]
+            if bd.isinstance(phi, type(None)) or bd.isinstance(kp, type(None)):
+                return None
+            phi_inv_list[i] = bd.inv(phi)
+            kpphi_inv_list[i] = bd.inv(bd.dot(kp, phi))
+
+        self._smatrix_cache = {
+            'phi_inv_list': phi_inv_list,
+            'kpphi_inv_list': kpphi_inv_list,
+        }
+        return self._smatrix_cache
+
     def MakeExcitationPlanewave(self,p_amp,p_phase,s_amp,s_phase,order = 0, direction = 'forward'):
         '''
         Front incidence
@@ -138,7 +202,7 @@ def MakeExcitationPlanewave(self,p_amp,p_phase,s_amp,s_phase,order = 0, directio
                         -p_amp*bd.sin(phi)*bd.exp(1j*p_phase))
 
             tmp2 = bd.zeros(2*self.nG,dtype=complex)
-            tmp2[order+self.nG] = 1.0            
+            tmp2[order+self.nG] = 1.0
             a0 = a0 + tmp2*(-s_amp*bd.cos(theta)*bd.sin(phi)*bd.exp(1j*s_phase) \
                             +p_amp*bd.cos(phi)*bd.exp(1j*p_phase))
         elif direction == 'backward':
@@ -151,20 +215,22 @@ def MakeExcitationPlanewave(self,p_amp,p_phase,s_amp,s_phase,order = 0, directio
             tmp2[order+self.nG] = 1.0
             bN = bN + tmp2*(-s_amp*bd.cos(theta)*bd.sin(phi)*bd.exp(1j*s_phase) \
                             +p_amp*bd.cos(phi)*bd.exp(1j*p_phase))
-        
+
         self.a0 = a0
         self.bN = bN
-        
+
     def GridLayer_geteps(self,ep_all):
         '''
         Fourier transform + eigenvalue for grid layer
         '''
+        self._invalidate_smatrix_cache()
+
         ptri = 0
         ptr = 0
         for i in range(self.Layer_N):
             if self.id_list[i][0] != 1:
                 continue
-            
+
             Nx = self.GridLayer_Nxy_list[ptri][0]
             Ny = self.GridLayer_Nxy_list[ptri][1]
             dN = 1./Nx/Ny
@@ -173,7 +239,7 @@ def GridLayer_geteps(self,ep_all):
                 ep_grid = [bd.reshape(ep_all[0][ptr:ptr+Nx*Ny],[Nx,Ny]),bd.reshape(ep_all[1][ptr:ptr+Nx*Ny],[Nx,Ny]),bd.reshape(ep_all[2][ptr:ptr+Nx*Ny],[Nx,Ny])]
             else:
                 ep_grid = bd.reshape(ep_all[ptr:ptr+Nx*Ny],[Nx,Ny])
-            
+
             epinv, ep2 = Epsilon_fft(dN,ep_grid,self.G)
 
             self.Patterned_epinv_list[self.id_list[i][2]] = epinv
@@ -187,7 +253,7 @@ def GridLayer_geteps(self,ep_all):
             self.phi_list[self.id_list[i][1]] = phi
 
             ptr += Nx*Ny
-            ptri += 1            
+            ptri += 1
 
     def Return_eps(self,which_layer,Nx,Ny,component='xx'):
         '''
@@ -212,10 +278,10 @@ def Return_eps(self,which_layer,Nx,Ny,component='xx'):
                 epk = self.Patterned_ep2_list[self.id_list[i][2]][self.nG:,:self.nG]
             elif component == 'yy':
                 epk = self.Patterned_ep2_list[self.id_list[i][2]][self.nG:,self.nG:]
-                
+
             return get_ifft(Nx,Ny,epk[0,:],self.G)
 
-            
+
     def RT_Solve(self,normalize = 0, byorder = 0):
         '''
         Reflection and transmission power computation
@@ -224,7 +290,7 @@ def RT_Solve(self,normalize = 0, byorder = 0):
 
         if normalize = 1, it will be divided by n[0]*cos(theta)
         '''
-        aN, b0 = SolveExterior(self.a0,self.bN,self.q_list,self.phi_list,self.kp_list,self.thickness_list)
+        aN, b0 = SolveExterior(self.a0,self.bN,self.q_list,self.phi_list,self.kp_list,self.thickness_list,self._get_smatrix_cache())
         fi,bi = GetZPoyntingFlux(self.a0,b0,self.omega,self.kp_list[0],self.phi_list[0],self.q_list[0],byorder=byorder)
         fe,be = GetZPoyntingFlux(aN,self.bN,self.omega,self.kp_list[-1],self.phi_list[-1],self.q_list[-1],byorder=byorder)
 
@@ -245,40 +311,40 @@ def GetAmplitudes_noTranslate(self,which_layer):
         returns fourier amplitude
         '''
         if which_layer == 0 :
-            aN, b0 = SolveExterior(self.a0,self.bN,self.q_list,self.phi_list,self.kp_list,self.thickness_list)
+            aN, b0 = SolveExterior(self.a0,self.bN,self.q_list,self.phi_list,self.kp_list,self.thickness_list,self._get_smatrix_cache())
             ai = self.a0
             bi = b0
 
         elif which_layer == self.Layer_N-1:
-            aN, b0 = SolveExterior(self.a0,self.bN,self.q_list,self.phi_list,self.kp_list,self.thickness_list)
+            aN, b0 = SolveExterior(self.a0,self.bN,self.q_list,self.phi_list,self.kp_list,self.thickness_list,self._get_smatrix_cache())
             ai = aN
             bi = self.bN
 
         else:
-            ai, bi = SolveInterior(which_layer,self.a0,self.bN,self.q_list,self.phi_list,self.kp_list,self.thickness_list)
+            ai, bi = SolveInterior(which_layer,self.a0,self.bN,self.q_list,self.phi_list,self.kp_list,self.thickness_list,self._get_smatrix_cache())
         return ai,bi
-    
+
     def GetAmplitudes(self,which_layer,z_offset):
         '''
         returns fourier amplitude
         '''
         if which_layer == 0 :
-            aN, b0 = SolveExterior(self.a0,self.bN,self.q_list,self.phi_list,self.kp_list,self.thickness_list)
+            aN, b0 = SolveExterior(self.a0,self.bN,self.q_list,self.phi_list,self.kp_list,self.thickness_list,self._get_smatrix_cache())
             ai = self.a0
             bi = b0
 
         elif which_layer == self.Layer_N-1:
-            aN, b0 = SolveExterior(self.a0,self.bN,self.q_list,self.phi_list,self.kp_list,self.thickness_list)
+            aN, b0 = SolveExterior(self.a0,self.bN,self.q_list,self.phi_list,self.kp_list,self.thickness_list,self._get_smatrix_cache())
             ai = aN
             bi = self.bN
 
         else:
-            ai, bi = SolveInterior(which_layer,self.a0,self.bN,self.q_list,self.phi_list,self.kp_list,self.thickness_list)
+            ai, bi = SolveInterior(which_layer,self.a0,self.bN,self.q_list,self.phi_list,self.kp_list,self.thickness_list,self._get_smatrix_cache())
 
         ai, bi = TranslateAmplitudes(self.q_list[which_layer],self.thickness_list[which_layer],z_offset,ai,bi)
 
         return ai,bi
-    
+
     def Solve_FieldFourier(self,which_layer,z_offset):
         '''
         returns field amplitude in fourier space: [ex,ey,ez], [hx,hy,hz]
@@ -305,7 +371,7 @@ def Solve_FieldFourier(self,which_layer,z_offset):
             fexy = bd.dot(self.kp_list[which_layer],tmp2)
             fey = - fexy[:self.nG]
             fex = fexy[self.nG:]
-        
+
             #hz in Fourier space
             fhz = (self.kx*fey - self.ky*fex)/self.omega
 
@@ -349,7 +415,7 @@ def Solve_FieldOnGrid(self,which_layer,z_offset,Nxy=None):
 
     def Volume_integral(self,which_layer,Mx,My,Mz,normalize=0):
         '''Mxyz is convolution matrix.
-        This function computes 1/A\int_V Mx|Ex|^2+My|Ey|^2+Mz|Ez|^2
+        This function computes 1/A\\int_V Mx|Ex|^2+My|Ey|^2+Mz|Ez|^2
         To be consistent with Poynting vector defintion here, the absorbed power will be just omega*output
         '''
         kp = self.kp_list[which_layer]
@@ -362,10 +428,10 @@ def Volume_integral(self,which_layer,Mx,My,Mz,normalize=0):
             epinv = self.Patterned_epinv_list[self.id_list[which_layer][2]]
 
         # un-translated amplitdue
-        ai, bi = SolveInterior(which_layer,self.a0,self.bN,self.q_list,self.phi_list,self.kp_list,self.thickness_list)
+        ai, bi = SolveInterior(which_layer,self.a0,self.bN,self.q_list,self.phi_list,self.kp_list,self.thickness_list,self._get_smatrix_cache())
         ab = bd.hstack((ai,bi))
         abMatrix = bd.outer(ab,bd.conj(ab))
-        
+
         Mt = Matrix_zintegral(q,self.thickness_list[which_layer])
         # overall
         abM = abMatrix * Mt
@@ -386,14 +452,14 @@ def Volume_integral(self,which_layer,Mx,My,Mz,normalize=0):
                             bd.hstack((Mzeros,My,Mzeros)),\
                             bd.hstack((Mzeros,Mzeros,Mz))))
 
-        # integral = Tr[ abMatrix * F^\dagger *  Matconv *F ] 
+        # integral = Tr[ abMatrix * F^\dagger *  Matconv *F ]
         tmp = bd.dot(bd.dot(bd.conj(bd.transpose(F)),Mtotal),F)
         val = bd.trace(bd.dot(abM,tmp))
 
         if normalize == 1:
             val = val*self.normalization
         return val
-        
+
     def Solve_ZStressTensorIntegral(self,which_layer):
         '''
         returns 2F_x,2F_y,2F_z, integrated over z-plane
@@ -436,7 +502,7 @@ def Solve_ZStressTensorIntegral(self,which_layer):
 
 def MakeKPMatrix(omega,layer_type,epinv,kx,ky):
     nG = len(kx)
-    
+
     # uniform layer, epinv has length 1
     if layer_type == 0:
         # JkkJT = np.block([[np.diag(ky*ky), np.diag(-ky*kx)],
@@ -444,14 +510,14 @@ def MakeKPMatrix(omega,layer_type,epinv,kx,ky):
 
         Jk = bd.vstack((bd.diag(-ky),bd.diag(kx)))
         JkkJT = bd.dot(Jk,bd.transpose(Jk))
-        
+
         kp = omega**2*bd.eye(2*nG) - epinv*JkkJT
     # patterned layer
     else:
         Jk = bd.vstack((bd.diag(-ky),bd.diag(kx)))
         tmp = bd.dot(Jk,epinv)
         kp = omega**2*bd.eye(2*nG) - bd.dot(tmp,bd.transpose(Jk))
-        
+
     return kp
 
 def SolveLayerEigensystem_uniform(omega,kx,ky,epsilon):
@@ -466,11 +532,11 @@ def SolveLayerEigensystem_uniform(omega,kx,ky,epsilon):
 
 def SolveLayerEigensystem(omega,kx,ky,kp,ep2):
     nG = len(kx)
-    
+
     k = bd.vstack((bd.diag(kx),bd.diag(ky)))
     kkT = bd.dot(k,bd.transpose(k))
     M = bd.dot(ep2,kp) - kkT
-    
+
     q,phi = bd.eig(M)
 
     q = bd.sqrt(q)
@@ -478,12 +544,12 @@ def SolveLayerEigensystem(omega,kx,ky,kp,ep2):
     q = bd.where(bd.imag(q)<0.,-q,q)
     return q,phi
 
-def GetSMatrix(indi,indj,q_list,phi_list,kp_list,thickness_list):
+def GetSMatrix(indi,indj,q_list,phi_list,kp_list,thickness_list,smatrix_cache=None):
     ''' S_ij: size 4n*4n
     '''
     #assert type(indi) == int, 'layer index i must be integar'
     #assert type(indj) == int, 'layer index j must be integar'
-    
+
     nG2 = len(q_list[0])
     S11 = bd.eye(nG2,dtype=complex)
     S12 = bd.zeros_like(S11)
@@ -493,15 +559,22 @@ def GetSMatrix(indi,indj,q_list,phi_list,kp_list,thickness_list):
         return S11,S12,S21,S22
     elif indi>indj:
         raise Exception('indi must be < indj')
-   
+
     for l in range(indi,indj):
         ## next layer
         lp1 = l+1
 
         ## Q = inv(phi_l) * phi_lp1
-        Q = bd.dot(bd.inv(phi_list[l]),  phi_list[lp1])
+        if smatrix_cache is None:
+            phi_inv_l = bd.inv(phi_list[l])
+            kpphi_inv_l = bd.inv(bd.dot(kp_list[l],phi_list[l]))
+        else:
+            phi_inv_l = smatrix_cache['phi_inv_list'][l]
+            kpphi_inv_l = smatrix_cache['kpphi_inv_list'][l]
+
+        Q = bd.dot(phi_inv_l, phi_list[lp1])
         ## P = ql*inv(kp_l*phi_l) * kp_lp1*phi_lp1*q_lp1^-1
-        P1 = bd.dot(bd.diag(q_list[l]),   bd.inv(bd.dot(kp_list[l],phi_list[l])))
+        P1 = bd.dot(bd.diag(q_list[l]), kpphi_inv_l)
         P2 = bd.dot(bd.dot(kp_list[lp1],phi_list[lp1]),   bd.diag(1./q_list[lp1]))
         P = bd.dot(P1,P2)
         # P1 = bd.dot(kp_list[l],phi_list[l])
@@ -533,32 +606,32 @@ def GetSMatrix(indi,indj,q_list,phi_list,kp_list,thickness_list):
         P2 = bd.dot(S22,bd.dot(T12,S12))
         P1 = bd.dot(S22,bd.dot(T11,d2))
         S22 = P1 + P2
-        
+
     return S11,S12,S21,S22
 
-def SolveExterior(a0,bN,q_list,phi_list,kp_list,thickness_list):
+def SolveExterior(a0,bN,q_list,phi_list,kp_list,thickness_list,smatrix_cache=None):
     '''
     Given a0, bN, solve for b0, aN
     '''
 
     Nlayer = len(thickness_list) # total number of layers
-    S11, S12, S21, S22 = GetSMatrix(0,Nlayer-1,q_list,phi_list,kp_list,thickness_list)
+    S11, S12, S21, S22 = GetSMatrix(0,Nlayer-1,q_list,phi_list,kp_list,thickness_list,smatrix_cache=smatrix_cache)
 
     aN = bd.dot(S11,a0) + bd.dot(S12,bN)
     b0 = bd.dot(S21,a0) + bd.dot(S22,bN)
 
     return aN,b0
 
-def SolveInterior(which_layer,a0,bN,q_list,phi_list,kp_list,thickness_list):
+def SolveInterior(which_layer,a0,bN,q_list,phi_list,kp_list,thickness_list,smatrix_cache=None):
     '''
     Given a0, bN, solve for ai, bi
     Layer numbering starts from 0
     '''
     Nlayer = len(thickness_list) # total number of layers
     nG2 = len(q_list[0])
-    
-    S11, S12, S21, S22 = GetSMatrix(0,which_layer,q_list,phi_list,kp_list,thickness_list)
-    pS11, pS12, pS21, pS22 = GetSMatrix(which_layer,Nlayer-1,q_list,phi_list,kp_list,thickness_list)
+
+    S11, S12, S21, S22 = GetSMatrix(0,which_layer,q_list,phi_list,kp_list,thickness_list,smatrix_cache=smatrix_cache)
+    pS11, pS12, pS21, pS22 = GetSMatrix(which_layer,Nlayer-1,q_list,phi_list,kp_list,thickness_list,smatrix_cache=smatrix_cache)
 
     # tmp = inv(1-S12*pS21)
     tmp = bd.inv(bd.eye(nG2)-bd.dot(S12,pS21))
@@ -566,14 +639,14 @@ def SolveInterior(which_layer,a0,bN,q_list,phi_list,kp_list,thickness_list):
     ai = bd.dot(tmp,  bd.dot(S11,a0)+bd.dot(S12,bd.dot(pS22,bN)))
     # bi = pS21 ai + pS22 bN
     bi = bd.dot(pS21,ai) + bd.dot(pS22,bN)
-    
+
     return ai,bi
 
 def TranslateAmplitudes(q,thickness,dz,ai,bi):
     aim = ai*bd.exp(1j*q*dz)
     bim = bi*bd.exp(1j*q*(thickness-dz))
     return aim,bim
-        
+
 
 def GetZPoyntingFlux(ai,bi,omega,kp,phi,q,byorder=0):
     '''
diff --git a/tests/test_rcwa.py b/tests/test_rcwa.py
index a0acc73..1758bd1 100644
--- a/tests/test_rcwa.py
+++ b/tests/test_rcwa.py
@@ -1,5 +1,6 @@
 import numpy as np
 import grcwa
+import grcwa.rcwa as rcwa_core
 from .utils import t_grad
 
 try:
@@ -88,6 +89,146 @@ def test_rcwa():
     Tx,Ty,Tz = obj.Solve_ZStressTensorIntegral(0)
     assert Tz<0
 
+
+def test_smatrix_cache_unavailable_before_patterned_setup():
+    planewave={'p_amp':1,'s_amp':0,'p_phase':0,'s_phase':0}
+    obj = grcwa.obj(nG, L1, L2, freq, theta, phi, verbose=0)
+    obj.Add_LayerUniform(thick0, epsuniform0)
+    obj.Add_LayerGrid(pthick[0], Nx, Ny)
+    obj.Add_LayerUniform(thickN, epsuniformN)
+    obj.Init_Setup(Pscale=1., Gmethod=0)
+    obj.MakeExcitationPlanewave(planewave['p_amp'],planewave['p_phase'],
+                                planewave['s_amp'],planewave['s_phase'],order=0)
+
+    # Patterned layer eigensystem not yet available
+    assert obj._get_smatrix_cache() is None
+
+
+def test_getsmatrix_cached_vs_uncached_identical():
+    planewave={'p_amp':1,'s_amp':0,'p_phase':0,'s_phase':0}
+    obj = rcwa_assembly(epgrid, freq, theta, phi, planewave, pthick, Pscale=1.)
+
+    cache = obj._get_smatrix_cache()
+    assert cache is not None
+    assert len(cache['phi_inv_list']) == obj.Layer_N
+    assert len(cache['kpphi_inv_list']) == obj.Layer_N
+
+    s_cached = rcwa_core.GetSMatrix(0, obj.Layer_N-1, obj.q_list, obj.phi_list,
+                                    obj.kp_list, obj.thickness_list,
+                                    smatrix_cache=cache)
+    s_plain = rcwa_core.GetSMatrix(0, obj.Layer_N-1, obj.q_list, obj.phi_list,
+                                   obj.kp_list, obj.thickness_list,
+                                   smatrix_cache=None)
+
+    for blk_cached, blk_plain in zip(s_cached, s_plain):
+        assert np.allclose(blk_cached, blk_plain)
+
+
+def test_exterior_interior_cached_vs_uncached_identical():
+    planewave={'p_amp':1,'s_amp':0,'p_phase':0,'s_phase':0}
+    obj = rcwa_assembly(epgrid, freq, theta, phi, planewave, pthick, Pscale=1.)
+
+    cache = obj._get_smatrix_cache()
+    aN_c, b0_c = rcwa_core.SolveExterior(obj.a0, obj.bN, obj.q_list, obj.phi_list,
+                                         obj.kp_list, obj.thickness_list,
+                                         smatrix_cache=cache)
+    aN_p, b0_p = rcwa_core.SolveExterior(obj.a0, obj.bN, obj.q_list, obj.phi_list,
+                                         obj.kp_list, obj.thickness_list,
+                                         smatrix_cache=None)
+    assert np.allclose(aN_c, aN_p)
+    assert np.allclose(b0_c, b0_p)
+
+    ai_c, bi_c = rcwa_core.SolveInterior(1, obj.a0, obj.bN, obj.q_list, obj.phi_list,
+                                         obj.kp_list, obj.thickness_list,
+                                         smatrix_cache=cache)
+    ai_p, bi_p = rcwa_core.SolveInterior(1, obj.a0, obj.bN, obj.q_list, obj.phi_list,
+                                         obj.kp_list, obj.thickness_list,
+                                         smatrix_cache=None)
+    assert np.allclose(ai_c, ai_p)
+    assert np.allclose(bi_c, bi_p)
+
+def test_smatrix_cache_reuse_and_invalidate():
+    planewave={'p_amp':1,'s_amp':0,'p_phase':0,'s_phase':0}
+    obj = rcwa_assembly(epgrid,freq,theta,phi,planewave,pthick,Pscale=1.)
+
+    # First solve builds cache
+    R1, T1 = obj.RT_Solve(normalize=0)
+    assert obj._smatrix_cache is not None
+    old_kpphi = np.array(obj._smatrix_cache['kpphi_inv_list'][1], copy=True)
+
+    # Repeated solve should reuse the same cache object and result
+    R2, T2 = obj.RT_Solve(normalize=0)
+    assert np.allclose([R1, T1], [R2, T2])
+
+    # Updating patterned epsilon should invalidate and rebuild cache
+    epgrid2 = np.array(epgrid, copy=True)
+    epgrid2[0, 0] = epgrid2[0, 0] + 0.5
+    obj.GridLayer_geteps(epgrid2.flatten())
+    assert obj._smatrix_cache is None
+
+    obj.RT_Solve(normalize=0)
+    assert obj._smatrix_cache is not None
+    assert not np.allclose(obj._smatrix_cache['kpphi_inv_list'][1], old_kpphi)
+
+
+
+def _assemble_1d_compare_case(nG_case=41, Nx_case=41, Ny_case=9, freq_case=0.85):
+    L1c = [0.2, 0]
+    L2c = [0, 0.2]
+    theta_c = np.pi / 20
+    phi_c = 0.
+
+    # 1D profile varying along x only
+    x = np.linspace(0, 1., Nx_case)
+    eps_x = np.where(np.abs(x - 0.5) < 0.2, 10.0, 2.0)
+
+    ep_2d = np.repeat(eps_x[:, None], Ny_case, axis=1)
+    ep_1d = eps_x[:, None]
+
+    planewave_c = {'p_amp': 1, 's_amp': 0, 'p_phase': 0, 's_phase': 0}
+
+    # baseline 2D emulation
+    obj2 = grcwa.obj(nG_case, L1c, L2c, freq_case, theta_c, phi_c, verbose=0)
+    obj2.Add_LayerUniform(0.4, 1.0)
+    obj2.Add_LayerGrid(0.2, Nx_case, Ny_case)
+    obj2.Add_LayerUniform(0.4, 1.0)
+    obj2.Init_Setup(Gmethod=1)
+    obj2.MakeExcitationPlanewave(planewave_c['p_amp'], planewave_c['p_phase'],
+                                 planewave_c['s_amp'], planewave_c['s_phase'], order=0)
+    obj2.GridLayer_geteps(ep_2d.flatten())
+
+    # inferred 1D case (Ny=1)
+    obj1 = grcwa.obj(nG_case, L1c, L2c, freq_case, theta_c, phi_c, verbose=0)
+    obj1.Add_LayerUniform(0.4, 1.0)
+    obj1.Add_LayerGrid(0.2, Nx_case, 1)
+    obj1.Add_LayerUniform(0.4, 1.0)
+    obj1.Init_Setup(Gmethod=1)
+    obj1.MakeExcitationPlanewave(planewave_c['p_amp'], planewave_c['p_phase'],
+                                 planewave_c['s_amp'], planewave_c['s_phase'], order=0)
+    obj1.GridLayer_geteps(ep_1d.flatten())
+
+    return obj2, obj1
+
+
+def test_1d_inference_reduces_to_single_harmonic_axis():
+    obj2, obj1 = _assemble_1d_compare_case()
+
+    assert obj2.grid_periodic_dim == '2d'
+    assert obj1.grid_periodic_dim == '1dx'
+    assert np.all(obj1.G[:, 1] == 0)
+    assert obj1.nG <= obj2.nG
+
+
+def test_1d_inference_matches_2d_emulation_for_rt_points():
+    freqs = [0.7, 0.85, 1.0]
+    for f in freqs:
+        obj2, obj1 = _assemble_1d_compare_case(freq_case=f)
+        R2, T2 = obj2.RT_Solve(normalize=1)
+        R1, T1 = obj1.RT_Solve(normalize=1)
+
+        assert np.allclose(R1, R2, rtol=2e-3, atol=1e-5)
+        assert np.allclose(T1, T2, rtol=2e-3, atol=1e-5)
+
 if AG_AVAILABLE:
     grcwa.set_backend('autograd')
     def test_epsgrad():