diff --git a/app/helpers/miscellaneous.py b/app/helpers/miscellaneous.py
index 0091fdb9..8582f939 100644
--- a/app/helpers/miscellaneous.py
+++ b/app/helpers/miscellaneous.py
@@ -1151,14 +1151,26 @@ def keypoints_adjustments(
 
     # --- MANUAL ALIGNMENTS (Sliders) ---
     if parameters.get("FaceAdjEnableToggle", False):
-        kps_5_adj[:, 0] += parameters["KpsXSlider"]
-        kps_5_adj[:, 1] += parameters["KpsYSlider"]
-        kps_5_adj[:, 0] -= 255
-        kps_5_adj[:, 0] *= 1 + parameters["KpsScaleSlider"] / 100.0
-        kps_5_adj[:, 0] += 255
-        kps_5_adj[:, 1] -= 255
-        kps_5_adj[:, 1] *= 1 + parameters["KpsScaleSlider"] / 100.0
-        kps_5_adj[:, 1] += 255
+        # 1. Apply spatial translations (X / Y Axis)
+        # Adjusts the facial keypoints position based on user-defined offsets.
+        kps_5_adj[:, 0] += parameters.get("KpsXSlider", 0.0)
+        kps_5_adj[:, 1] += parameters.get("KpsYSlider", 0.0)
+
+        # 2. Apply spatial scaling
+        # Resizes the face representation while maintaining its relative geometry.
+        scale_val = parameters.get("KpsScaleSlider", 0.0)
+        if scale_val != 0.0:
+            scale_factor = 1.0 + (scale_val / 100.0)
+
+            # FW-BUG-FIX: Dynamic Centroid Calculation.
+            # Replaced the hardcoded '255' center with the actual barycenter
+            # of the face keypoints. This prevents unwanted translation (drift)
+            # when resizing faces that are not perfectly centered at (255, 255).
+            centroid = np.mean(kps_5_adj, axis=0)  # Returns array([mean_x, mean_y])
+
+            # Vectorized scaling: (Point - Centroid) * Scale + Centroid
+            # Computes both X and Y axes simultaneously for optimal NumPy performance.
+            kps_5_adj = (kps_5_adj - centroid) * scale_factor + centroid
 
     if (
         parameters.get("LandmarksPositionAdjEnableToggle", False)
diff --git a/app/processors/face_detectors.py b/app/processors/face_detectors.py
index d81d295d..e75d210d 100644
--- a/app/processors/face_detectors.py
+++ b/app/processors/face_detectors.py
@@ -236,7 +236,9 @@ def _filter_detections_gpu(
         skip_nms=False,
     ):
         """
-        Performs GPU-accelerated NMS, sorting, and filtering on raw detections from all angles.
+        Performs GPU-accelerated NMS, automatic heuristic sorting, and filtering on raw detections.
+        Designed for fully automated pipelines: filters out bad rotation artifacts via geometric
+        sanity checks and elects the best candidates using a non-linear Confidence/Area heuristic.
 
         Args:
             scores_list (list): List of score arrays (np.ndarray) from each detection angle.
@@ -244,36 +246,33 @@ def _filter_detections_gpu(
             kpss_list (list): List of keypoint arrays (np.ndarray) from each detection angle.
             img_height (int): The *original* height of the source image.
             img_width (int): The *original* width of the source image.
-            det_scale (torch.Tensor): The scaling factor used to resize the image (new_height / original_height).
-            max_num (int): The maximum number of faces to return, sorted by size and centrality.
+            det_scale (torch.Tensor): The scaling factor used to resize the image.
+            max_num (int): The maximum number of faces to return.
             skip_nms (bool): If True, skips the Non-Maximum Suppression step.
 
         Returns:
             tuple: (det, kpss_final, score_values)
-                - det (np.ndarray): Final bounding boxes, scaled to original image size.
-                - kpss_final (np.ndarray): Final keypoints, scaled to original image size.
-                - score_values (np.ndarray): Scores for the final detections.
         """
         if not bboxes_list:
             return None, None, None
 
-        # Convert all raw detection lists to single GPU tensors.
-        scores_tensor = (
-            torch.from_numpy(np.vstack(scores_list))
-            .to(self.models_processor.device)
-            .squeeze()
+        # ----------------------------------------------------------------------
+        # 1. TENSOR CREATION & THREAD SAFETY
+        # ----------------------------------------------------------------------
+        # torch.tensor() always copies its input, so the tensors on 'device' never alias numpy memory.
+        # This prevents race conditions if the numpy arrays are mutated by CPU threads concurrently.
+        device = self.models_processor.device
+
+        scores_tensor = torch.tensor(
+            np.vstack(scores_list), dtype=torch.float32, device=device
+        ).squeeze()
+        bboxes_tensor = torch.tensor(
+            np.vstack(bboxes_list), dtype=torch.float32, device=device
         )
-        bboxes_tensor = torch.from_numpy(np.vstack(bboxes_list)).to(
-            self.models_processor.device
+        kpss_tensor = torch.tensor(
+            np.vstack(kpss_list), dtype=torch.float32, device=device
         )
-        kpss_tensor = torch.from_numpy(np.vstack(kpss_list)).to(
-            self.models_processor.device
-        )
-
-        bboxes_tensor = torch.as_tensor(bboxes_tensor, dtype=torch.float32)
-        scores_tensor = torch.as_tensor(scores_tensor, dtype=torch.float32).reshape(-1)
 
-        # --- Validation Block to ensure tensors are well-formed before NMS ---
         if bboxes_tensor.numel() == 0:
             return None, None, None
         if bboxes_tensor.dim() == 1 and bboxes_tensor.numel() == 4:
@@ -281,81 +280,100 @@ def _filter_detections_gpu(
         if scores_tensor.dim() == 0:
             scores_tensor = scores_tensor.unsqueeze(0)
         if bboxes_tensor.size(0) != scores_tensor.size(0):
-            # Mismatch in tensor sizes, aborting.
             return None, None, None
 
-        # Ensure tensors are contiguous (optimizes NMS)
         bboxes_tensor = bboxes_tensor.contiguous()
         scores_tensor = scores_tensor.contiguous()
 
+        # ----------------------------------------------------------------------
+        # 2. NON-MAXIMUM SUPPRESSION (NMS)
+        # ----------------------------------------------------------------------
         if not skip_nms:
-            # Perform Non-Maximum Suppression on the GPU to remove overlapping boxes.
             nms_thresh = 0.4
+            # NMS must rely strictly on raw network confidence.
+            # Multiplying by area here would allow bloated bad rotations to absorb good ones.
             keep_indices = nms(bboxes_tensor, scores_tensor, iou_threshold=nms_thresh)
-
-            det_boxes, det_kpss, det_scores = (
-                bboxes_tensor[keep_indices],
-                kpss_tensor[keep_indices],
-                scores_tensor[keep_indices],
-            )
+            det_boxes = bboxes_tensor[keep_indices]
+            det_kpss = kpss_tensor[keep_indices]
+            det_scores = scores_tensor[keep_indices]
         else:
-            det_boxes, det_kpss, det_scores = (
-                bboxes_tensor,
-                kpss_tensor,
-                scores_tensor,
-            )
-
-        # Sort the remaining detections by their confidence score.
-        sorted_indices = torch.argsort(det_scores, descending=True)
-        det_boxes, det_kpss, det_scores = (
-            det_boxes[sorted_indices],
-            det_kpss[sorted_indices],
-            det_scores[sorted_indices],
-        )
-
-        # If more faces are detected than max_num, select the best ones.
+            det_boxes = bboxes_tensor
+            det_kpss = kpss_tensor
+            det_scores = scores_tensor
+
+        # ----------------------------------------------------------------------
+        # 3. GEOMETRIC SANITY CHECK (KPS Boundary Validation)
+        # ----------------------------------------------------------------------
+        # Anomalies from incorrect rotations often produce KPS coordinates that
+        # fall far outside their own bounding box. We filter these geometric impossibilities.
+        if det_boxes.shape[0] > 0:
+            box_widths = det_boxes[:, 2] - det_boxes[:, 0]
+            box_heights = det_boxes[:, 3] - det_boxes[:, 1]
+
+            # 15% tolerance margin
+            margin_x = box_widths * 0.15
+            margin_y = box_heights * 0.15
+
+            min_x = det_boxes[:, 0] - margin_x
+            min_y = det_boxes[:, 1] - margin_y
+            max_x = det_boxes[:, 2] + margin_x
+            max_y = det_boxes[:, 3] + margin_y
+
+            # Check if all 5 keypoints are within the expanded bounding box
+            valid_kps_mask = (
+                (det_kpss[:, :, 0] >= min_x.unsqueeze(1))
+                & (det_kpss[:, :, 0] <= max_x.unsqueeze(1))
+                & (det_kpss[:, :, 1] >= min_y.unsqueeze(1))
+                & (det_kpss[:, :, 1] <= max_y.unsqueeze(1))
+            ).all(dim=1)
+
+            if valid_kps_mask.any():
+                det_boxes = det_boxes[valid_kps_mask]
+                det_kpss = det_kpss[valid_kps_mask]
+                det_scores = det_scores[valid_kps_mask]
+
+        # ----------------------------------------------------------------------
+        # 4. AUTOMATIC NON-LINEAR SORTING HEURISTIC
+        # ----------------------------------------------------------------------
         if max_num > 0 and det_boxes.shape[0] > max_num:
             if det_boxes.shape[0] > 1:
-                # Score faces based on a combination of their size and proximity to the image center.
-                # This filtering happens on *unscaled* coordinates (relative to the padded detection image).
-                area = (det_boxes[:, 2] - det_boxes[:, 0]) * (
+                areas = (det_boxes[:, 2] - det_boxes[:, 0]) * (
                     det_boxes[:, 3] - det_boxes[:, 1]
                 )
-                # The old logic (img_height / det_scale) was mathematically incorrect and
-                # produced extreme values for non-standard aspect ratios (like VR videos).
-                # The correct logic is to find the center of the *active image area*
-                # on the padded canvas.
-                # new_height_on_canvas = img_height * det_scale
-                # new_width_on_canvas = img_width * det_scale
-                det_img_center_y = (img_height * det_scale) / 2.0
-                det_img_center_x = (img_width * det_scale) / 2.0
-
-                center_x = (det_boxes[:, 0] + det_boxes[:, 2]) / 2 - det_img_center_x
-                center_y = (det_boxes[:, 1] + det_boxes[:, 3]) / 2 - det_img_center_y
-
-                offset_dist_squared = center_x**2 + center_y**2
-                # This score favors large faces (area) that are close to the center
-                # (low offset_dist_squared).
-                values = area - offset_dist_squared * 2.0
-                bindex = torch.argsort(values, descending=True)[:max_num]
-                det_boxes, det_kpss, det_scores = (
-                    det_boxes[bindex],
-                    det_kpss[bindex],
-                    det_scores[bindex],
-                )
-            else:
-                bindex = torch.arange(
-                    det_boxes.shape[0], device=self.models_processor.device
-                )[:max_num]
+                areas = areas.clamp(min=1.0)
+
+                # Normalize values to prevent scale dominance
+                norm_scores = det_scores / (det_scores.max() + 1e-6)
+                norm_areas = areas / (areas.max() + 1e-6)
+
+                # Non-linear heuristic. Squaring the confidence drastically punishes
+                # uncertain faces. Only highly confident faces can leverage their 'Area'
+                # to win the sorting battle. This removes the need for manual UI strategies.
+                combined_values = (norm_scores**2) * 0.8 + (norm_areas * 0.2)
+
+                bindex = torch.argsort(combined_values, descending=True)[:max_num]
+
                 det_boxes = det_boxes[bindex]
                 det_kpss = det_kpss[bindex]
                 det_scores = det_scores[bindex]
-
-        # Transfer final results back to CPU and scale them to the original image dimensions.
+            else:
+                det_boxes = det_boxes[:max_num]
+                det_kpss = det_kpss[:max_num]
+                det_scores = det_scores[:max_num]
+        else:
+            # Plain confidence sort when no truncation applies (max_num <= 0 or count <= max_num)
+            sorted_indices = torch.argsort(det_scores, descending=True)
+            det_boxes = det_boxes[sorted_indices]
+            det_kpss = det_kpss[sorted_indices]
+            det_scores = det_scores[sorted_indices]
+
+        # ----------------------------------------------------------------------
+        # 5. CPU TRANSFER & FINAL SCALING
+        # ----------------------------------------------------------------------
         det_scale_val = det_scale.cpu().item()
+
         det = det_boxes.cpu().numpy() / det_scale_val
         kpss_final = det_kpss.cpu().numpy() / det_scale_val
-
         score_values = det_scores.cpu().numpy()
 
         return det, kpss_final, score_values
@@ -480,17 +498,6 @@ def run_detect(
         Supports tracking via 'previous_detections'.
         """
         rotation_angles = rotation_angles or [0]
-        use_multi_rotation = len(rotation_angles) > 1
-
-        # Multi-angle detection must run the full detector path. Tracking shortcuts
-        # can reuse stale landmarks from a different orientation and destabilize
-        # swaps on upside-down faces during recording.
-        # Also force from_points=True so the secondary landmark model uses kpss-aligned
-        # crops. Without this, the bbox-only crop path (from_points=False) produces an
-        # upside-down face crop that landmark detectors cannot handle, which corrupts
-        # kpss_5 and causes wrong embeddings / ghost-face artifacts.
-        if use_multi_rotation:
-            from_points = True
 
         control = (
             control_override
diff --git a/app/processors/face_landmark_detectors.py b/app/processors/face_landmark_detectors.py
index e0cab253..9db70210 100644
--- a/app/processors/face_landmark_detectors.py
+++ b/app/processors/face_landmark_detectors.py
@@ -34,6 +34,11 @@ def _kps5_is_degenerate(kps5) -> bool:
     """
     if kps5 is None:
         return True
+
+    # Safely convert PyTorch tensors (especially on CUDA) to numpy arrays.
+    if torch.is_tensor(kps5):
+        kps5 = kps5.detach().cpu().numpy()
+
     try:
         kps5 = np.asarray(kps5, dtype=np.float32)
     except (ValueError, TypeError):
@@ -279,17 +284,27 @@ def _prepare_crop(
         Prepares a cropped and warped face image for a landmark detector.
         This helper centralizes the repetitive pre-processing logic of aligning a face
         based on either a bounding box or existing keypoints.
-
         Returns:
             Tuple[torch.Tensor, np.ndarray, np.ndarray]: The cropped image, the forward transform matrix (M),
                                                           and the inverse transform matrix (IM).
         """
+        import math
+
         if not from_points:
             # Align the face using the bounding box center and size.
             w, h = (bbox[2] - bbox[0]), (bbox[3] - bbox[1])
             center = (bbox[2] + bbox[0]) / 2, (bbox[3] + bbox[1]) / 2
             _scale = target_size / (max(w, h) * scale)
-            aimg, M = faceutil.transform(img, center, target_size, _scale, 0)
+
+            # Estimate in-plane roll from the first two keypoints (presumably the eyes) to upright tilted faces in bbox fallback mode.
+            angle = 0.0
+            if det_kpss is not None and len(det_kpss) >= 2:
+                dx = det_kpss[1][0] - det_kpss[0][0]
+                dy = det_kpss[1][1] - det_kpss[0][1]
+                if math.hypot(dx, dy) > 1e-3:
+                    angle = math.degrees(math.atan2(-dy, dx))
+
+            aimg, M = faceutil.transform(img, center, target_size, _scale, angle)
             IM = faceutil.invertAffineTransform(M)
         else:
             if det_kpss is None or len(det_kpss) == 0:
diff --git a/app/processors/face_masks.py b/app/processors/face_masks.py
index ecf07f43..cc5abd0d 100644
--- a/app/processors/face_masks.py
+++ b/app/processors/face_masks.py
@@ -183,15 +183,31 @@ def _enhance_and_align_swapped_mouth(
             )
             enhanced_swap_img[:, ymin:ymax, xmin:xmax] = sharpened_mouth
 
-        # --- Alignment Logic (Stable Center of Mass) ---
+        # --- Alignment Logic (Statistical Anchoring) ---
         y_s_full, x_s_full = torch.where(mouth_swap)
-        y_s_inner, _ = torch.where(inner_swap)
 
-        # Use Mean (Center of Mass) instead of min/max to avoid temporal jittering
+        # X-axis centroid of the entire mouth (highly stable spatial reference)
         cx_s = x_s_full.float().mean()
-        # Keep the top of the inner mouth as Y anchor (but average the top 5% to avoid 1-pixel noise)
-        top_y_k = max(1, int(y_s_inner.shape[0] * 0.01))
-        cy_s = torch.topk(y_s_inner.float(), k=top_y_k, largest=False).values.mean()
+
+        # FW-PERF-FIX: Branch-free mask selection (no Python-side `if` on a GPU boolean).
+        # NOTE: the single-argument torch.where() below still synchronizes host and device on CUDA.
+        mask_upper = labels_swap == 12
+        has_upper = mask_upper.any()
+
+        # Dynamically select the mask: if upper lip exists, use it. Otherwise, fallback to inner_swap.
+        target_mask = mask_upper | (inner_swap & ~has_upper)
+        y_s_target, _ = torch.where(target_mask)
+
+        # FW-BUG-FIX: "Statistical Boundary" anchoring.
+        # Calculates the teeth line by averaging all pixels of the target mask.
+        mean_ys = y_s_target.float().mean()
+        std_ys = y_s_target.float().std()
+
+        # Gracefully handle NaN strictly on the GPU (e.g., if only 1 pixel is found, std is NaN)
+        std_ys = torch.nan_to_num(std_ys, nan=0.0)
+
+        # The center of the reference region + 1.5x its standard deviation = optimal teeth alignment line
+        cy_s = mean_ys + 1.5 * std_ys
 
         mouthzoom = parameters.get("MouthParserStretchDecimalSlider", 1.05)
 
@@ -306,32 +322,60 @@ def _enhance_and_align_original_mouth(
             )
             enhanced_img_orig[:, ymin:ymax, xmin:xmax] = sharpened_mouth
 
-        # --- Alignment Logic (Stable Center of Mass) ---
+        # --- Alignment Logic (Statistical Anchoring) ---
         y_o_full, x_o_full = torch.where(mouth_orig)
         y_s_full, x_s_full = torch.where(mouth_swap)
 
-        w_o = (x_o_full.max() - x_o_full.min()).float()
-        w_s = (x_s_full.max() - x_s_full.min()).float()
+        # 1. SCALE (Width Standard Deviation)
+        # Width is calculated based on spatial dispersion. Even if the mask jitters,
+        # the scale factor will remain completely stable over time.
+        std_x_o = x_o_full.float().std()
+        std_x_s = x_s_full.float().std()
 
-        if w_o <= 0.0 or w_s <= 0.0:
+        if (
+            std_x_o <= 0.0
+            or std_x_s <= 0.0
+            or torch.isnan(std_x_o)
+            or torch.isnan(std_x_s)
+        ):
             return None, None
 
+        mouthzoom = parameters.get("MouthParserStretchDecimalSlider", 1.05)
+        scale_factor = (std_x_s / std_x_o) * mouthzoom
+
+        # 2. X-AXIS CENTROID
         cx_o = x_o_full.float().mean()
         cx_s = x_s_full.float().mean()
 
-        y_anchor_orig = torch.where(inner_orig)[0]
-        y_anchor_swap = torch.where(inner_swap)[0]
+        # 3. Y-AXIS ANCHORING (Statistical Boundary of the upper lip)
+        # FW-PERF-FIX: Branch-free mask selection (no Python-side `if` on a GPU boolean).
+        # Replaced CPU-side 'len()' checks with boolean tensor logic; torch.where(mask) still syncs on CUDA.
 
-        top_o_k = max(1, int(y_anchor_orig.shape[0] * 0.01))
-        top_s_k = max(1, int(y_anchor_swap.shape[0] * 0.01))
-        cy_o = torch.topk(y_anchor_orig.float(), k=top_o_k, largest=False).values.mean()
-        cy_s = torch.topk(y_anchor_swap.float(), k=top_s_k, largest=False).values.mean()
+        # --- Original Mouth Anchoring ---
+        mask_o_upper = labels_orig == 12
+        has_o_upper = mask_o_upper.any()
 
-        mouthzoom = parameters.get("MouthParserStretchDecimalSlider", 1.05)
-        scale_factor = (w_s / w_o) * mouthzoom
+        # Fallback to the entire original mouth if no upper lip is detected
+        target_mask_o = mask_o_upper | (mouth_orig & ~has_o_upper)
+        y_o_target, _ = torch.where(target_mask_o)
 
-        if scale_factor <= 0.0:
-            return None, None
+        mean_yo = y_o_target.float().mean()
+        std_yo = y_o_target.float().std()
+        std_yo = torch.nan_to_num(std_yo, nan=0.0)
+        cy_o = mean_yo + 1.5 * std_yo
+
+        # --- Swapped Mouth Anchoring ---
+        mask_s_upper = labels_swap == 12
+        has_s_upper = mask_s_upper.any()
+
+        # Fallback to the entire swapped mouth if no upper lip is detected
+        target_mask_s = mask_s_upper | (mouth_swap & ~has_s_upper)
+        y_s_target, _ = torch.where(target_mask_s)
+
+        mean_ys = y_s_target.float().mean()
+        std_ys = y_s_target.float().std()
+        std_ys = torch.nan_to_num(std_ys, nan=0.0)
+        cy_s = mean_ys + 1.5 * std_ys
 
         translate_x = cx_s - cx_o
         translate_y = cy_s - cy_o
@@ -364,6 +408,8 @@ def _enhance_and_align_original_mouth(
             content_mask.unsqueeze(0), kernel_size=5, sigma=1.0
         ).squeeze(0)
 
+        w_s = (x_s_full.max() - x_s_full.min()).float()
+
         # 2. Destroy the fake teeth with blur (Controlled by UI Slider)
         cavity_blur_pct = parameters.get("MouthOriginalCavityBlurSlider", 15) / 100.0
         # Calculate dynamic kernel size based on mouth width and user slider
diff --git a/app/processors/video_utils/sequential_detector.py b/app/processors/video_utils/sequential_detector.py
index 8db778fa..86acbe3d 100644
--- a/app/processors/video_utils/sequential_detector.py
+++ b/app/processors/video_utils/sequential_detector.py
@@ -310,45 +310,54 @@ def run(
             # to prevent downstream ".copy()" calls from throwing AttributeError.
             kps_standard = current_kps5.copy()
 
-            # Extract standard dense landmarks (68 or 203 depending on UI selection)
-            if use_landmark:
-                _, lm_kpss, _ = self.main_window.models_processor.run_detect_landmark(
+            # FW-LOGIC-FIX 1: Extract forced 203 landmarks FIRST if advanced editing features demand it.
+            # This ensures we always have a properly aligned 203 if required, independently of UI settings.
+            kps_203_local = numpy.zeros((203, 2), dtype=numpy.float32)
+            has_valid_203 = False
+
+            if requires_203:
+                _, lm_203, _ = self.main_window.models_processor.run_detect_landmark(
                     frame_tensor,
                     current_bbox,
                     current_kps5,
-                    detect_mode=landmark_mode,
-                    score=control.get("LandmarkDetectScoreSlider", 50) / 100.0,
+                    detect_mode="203",
+                    score=0.5,
                     use_mean_eyes=control.get("LandmarkMeanEyesToggle", False),
-                    from_points=True,
+                    from_points=True,  # STRICTLY REQUIRED FOR INTERNAL ALIGNMENT
                 )
-                if len(lm_kpss) > 0:
-                    kps_standard = lm_kpss
-
-            filtered_kpss.append(kps_standard)
-
-            # Extract forced 203 landmarks if advanced editing features demand it
-            if requires_203:
-                kps_203_local = numpy.zeros((203, 2), dtype=numpy.float32)
+                if len(lm_203) > 0:
+                    kps_203_local = lm_203
+                    has_valid_203 = True
+                filtered_kpss_203.append(kps_203_local)
 
-                # Optimization: If the user already selected 203 as standard, reuse it to save a CUDA call
-                if use_landmark and landmark_mode == "203" and len(kps_standard) == 203:
-                    kps_203_local = kps_standard.copy()
+            # FW-LOGIC-FIX 2: Extract standard dense landmarks (68, 203 or 478 depending on UI selection)
+            if use_landmark:
+                # OPTIMIZATION: Reuse the 203 landmarks computed above ONLY IF
+                # the user explicitly enabled 'from_points' in the UI.
+                # This effectively prevents a redundant and costly neural network forward pass.
+                if (
+                    landmark_mode == "203"
+                    and requires_203
+                    and has_valid_203
+                    and from_points
+                ):
+                    kps_standard = kps_203_local.copy()
                 else:
-                    _, lm_203, _ = (
+                    _, lm_kpss, _ = (
                         self.main_window.models_processor.run_detect_landmark(
                             frame_tensor,
                             current_bbox,
                             current_kps5,
-                            detect_mode="203",
-                            score=0.5,
+                            detect_mode=landmark_mode,
+                            score=control.get("LandmarkDetectScoreSlider", 50) / 100.0,
                             use_mean_eyes=control.get("LandmarkMeanEyesToggle", False),
-                            from_points=True,
+                            from_points=from_points,
                         )
                     )
-                    if len(lm_203) > 0:
-                        kps_203_local = lm_203
+                    if len(lm_kpss) > 0:
+                        kps_standard = lm_kpss
 
-                filtered_kpss_203.append(kps_203_local)
+            filtered_kpss.append(kps_standard)
 
         # Reformat output arrays to match the expected pipeline signature (CRITICAL TYPE CASTING)
         bboxes = numpy.array(filtered_bboxes, dtype=numpy.float32)
diff --git a/app/processors/workers/frame_worker.py b/app/processors/workers/frame_worker.py
index de2b065e..3bd49797 100644
--- a/app/processors/workers/frame_worker.py
+++ b/app/processors/workers/frame_worker.py
@@ -238,16 +238,8 @@ def __init__(
         ).view(1, 1, 3, 3)
         self.kernel_sobel_y = self.kernel_sobel_x.transpose(2, 3)
 
-        # FW-CPU-1: Per-worker CUDA stream, otherwise all 8 pool workers share the
-        # default stream and every `current_stream().synchronize()` call inside
-        # _run_model_with_lazy_build_check (face_swappers / face_masks / face_restorers /
-        # frame_enhancers / face_landmark_detectors) waits for *every* worker's
-        # pending GPU work. With CUDA 13's spin-wait scheduler that turns each pool
-        # worker into a 100%-CPU spinner; pinning each worker to its own stream
-        # cuts the sync surface back down to what the worker itself submitted.
-        self.worker_stream = (
-            torch.cuda.Stream() if self.models_processor.device == "cuda" else None
-        )
+        # Do not use local streams here! ONNX Runtime handles independent streams internally for each worker (fixes VRAM explosion)
+        self.worker_stream = None  # (torch.cuda.Stream() if self.models_processor.device == "cuda" else None)
 
     def set_scaling_transforms(self, control_params):
         """Initializes the torchvision transforms based on user interpolation settings."""
@@ -1995,6 +1987,10 @@ def _process_frame_standard(
                             k[:, 1] *= ratio_h
 
         # Manual Rotation
+        # FW-BUG-FIX: Store pre- and post-rotation dimensions to accurately
+        # reverse the affine transform later without accumulating black borders.
+        pre_rot_h, pre_rot_w = img.shape[1], img.shape[2]
+
         if control["ManualRotationEnableToggle"]:
             img = v2.functional.rotate(
                 img,
@@ -2003,6 +1999,8 @@ def _process_frame_standard(
                 expand=True,
             )
 
+        post_rot_h, post_rot_w = img.shape[1], img.shape[2]
+
         # --- DETECTION PHASE ---
         # The workers are now "Stateless Render Engines". They no longer track time or state.
         # They consume perfectly sequenced and EMA-smoothed detections from the Feeder thread.
@@ -2053,7 +2051,10 @@ def _process_frame_standard(
                         requires_203 = True
                         break
 
-            # STEP 1: Standard detection respecting User's choice
+            # --- STEP 1: Standard detection (Respect UI Toggle) ---
+            # We pass 'use_landmark_detection=use_landmark' to allow run_detect
+            # to attempt an initial extraction. If Auto-Rotation is enabled,
+            # run_detect may bypass dense landmark extraction if the angle is too extreme.
             bboxes, kpss_5, kpss = self.models_processor.run_detect(
                 img,
                 control.get("DetectorModelSelection", "RetinaFace"),
@@ -2063,7 +2064,7 @@ def _process_frame_standard(
                 use_landmark_detection=use_landmark,
                 landmark_detect_mode=landmark_mode,
                 landmark_score=control.get("LandmarkDetectScoreSlider", 50) / 100.0,
-                from_points=from_points,
+                from_points=from_points,  # Respects the UI toggle state
                 rotation_angles=[0]
                 if not control.get("AutoRotationToggle", False)
                 else [0, 90, 180, 270],
@@ -2072,32 +2073,87 @@ def _process_frame_standard(
                 bypass_bytetrack=True,
             )
 
-            # STEP 2: Smart Double-Scan for 203 points
+            # FW-LOGIC-FIX: Validate if Step 1 returned actual dense landmarks (> 5 points).
+            # If run_detect aborted dense extraction due to an extreme rotation angle,
+            # 'kpss' will merely contain a fallback copy of the sparse 'kpss_5'.
+            has_valid_dense_kpss = (
+                kpss is not None
+                and len(kpss) == len(bboxes)
+                and len(bboxes) > 0
+                and kpss[0].shape[0] > 5
+            )
+
+            # --- STEP 2: 203-Landmark Extraction (Expression Restorer / Face Editor) ---
             kpss_203 = None
             if requires_203:
-                if use_landmark and landmark_mode == "203":
-                    # OPTIMIZATION: 203 was already extracted by the user's choice. Zero CUDA cost.
-                    kpss_203 = kpss
-                else:
-                    # Extract 203 specifically for advanced features
-                    _, _, kpss_203 = self.models_processor.run_detect(
-                        img,
-                        control.get("DetectorModelSelection", "RetinaFace"),
-                        max_num=control.get("MaxFacesToDetectSlider", 1),
-                        score=control.get("DetectorScoreSlider", 50) / 100.0,
-                        input_size=(512, 512),
-                        use_landmark_detection=True,
-                        landmark_detect_mode="203",
-                        landmark_score=control.get("LandmarkDetectScoreSlider", 50)
-                        / 100.0,
-                        from_points=from_points,
-                        rotation_angles=[0]
-                        if not control.get("AutoRotationToggle", False)
-                        else [0, 90, 180, 270],
-                        use_mean_eyes=control.get("LandmarkMeanEyesToggle", False),
-                        previous_detections=None,
-                        bypass_bytetrack=True,
-                    )
+                kpss_203_list = []
+                # LOGIC FIX: We ONLY reuse Step 1 landmarks if they are already in the 203
+                # format AND the user enabled 'from_points'. Otherwise, we MUST re-extract
+                # with 'from_points=True' to ensure geometric alignment for the Expression Restorer.
+                can_reuse_step1_203 = (
+                    has_valid_dense_kpss and landmark_mode == "203" and from_points
+                )
+
+                if bboxes is not None and len(bboxes) > 0:
+                    for idx in range(len(bboxes)):
+                        if can_reuse_step1_203:
+                            kps_203_local = kpss[idx].copy()
+                        else:
+                            # FORCE from_points=True: Strictly required for the geometric
+                            # alignment of advanced tools (FaceEditor, Makeup, Expressions).
+                            _, lm_203, _ = self.models_processor.run_detect_landmark(
+                                img,
+                                bboxes[idx],
+                                kpss_5[idx],
+                                detect_mode="203",
+                                score=0.5,
+                                use_mean_eyes=control.get(
+                                    "LandmarkMeanEyesToggle", False
+                                ),
+                                from_points=True,  # STRICTLY REQUIRED HERE
+                            )
+                            kps_203_local = (
+                                lm_203
+                                if len(lm_203) > 0
+                                else np.zeros((203, 2), dtype=np.float32)
+                            )
+                        kpss_203_list.append(kps_203_local)
+                kpss_203 = np.array(kpss_203_list, dtype=object)
+
+            # --- STEP 3: Fallback for standard landmarks (UI Display) ---
+            if (
+                use_landmark
+                and not has_valid_dense_kpss
+                and bboxes is not None
+                and len(bboxes) > 0
+            ):
+                kpss_list = []
+                for idx in range(len(bboxes)):
+                    # Smart reuse: If Step 2 just computed 203 landmarks, use them
+                    # to avoid a redundant neural network forward pass.
+                    if (
+                        landmark_mode == "203"
+                        and kpss_203 is not None
+                        and len(kpss_203) > idx
+                    ):
+                        kpss_list.append(kpss_203[idx])
+                    else:
+                        # Respects the UI toggle state for standard UI landmarks
+                        _, lm_std, _ = self.models_processor.run_detect_landmark(
+                            img,
+                            bboxes[idx],
+                            kpss_5[idx],
+                            detect_mode=landmark_mode,
+                            score=control.get("LandmarkDetectScoreSlider", 50) / 100.0,
+                            use_mean_eyes=control.get("LandmarkMeanEyesToggle", False),
+                            from_points=from_points,
+                        )
+                        kpss_list.append(
+                            lm_std
+                            if len(lm_std) > 0
+                            else np.zeros((int(landmark_mode), 2), dtype=np.float32)
+                        )
+                kpss = np.array(kpss_list, dtype=object)
 
         if (
             isinstance(kpss_5, np.ndarray)
@@ -2422,12 +2478,71 @@ def _process_frame_standard(
 
         # Undo Rotation / Scaling
         if control["ManualRotationEnableToggle"]:
+            angle = control["ManualRotationAngleSlider"]
+
+            # FW-BUG-FIX 1: Reverse rotation WITH expand=True so the canvas can physically
+            # accommodate the restored dimensions (fixes the 90/270 degree crop).
+            # Then center crop to strictly restore the original tensor shape.
             img = v2.functional.rotate(
                 img,
-                angle=-control["ManualRotationAngleSlider"],
+                angle=-angle,
                 interpolation=v2.InterpolationMode.BILINEAR,
-                expand=True,
+                expand=True,  # Keep the whole un-rotated canvas; center_crop below restores the original size
             )
+            img = v2.functional.center_crop(img, (pre_rot_h, pre_rot_w))
+
+            # FW-MATH: Reverse the affine transform on all spatial coordinates.
+            import math
+
+            rad = math.radians(angle)
+            cos_a, sin_a = math.cos(rad), math.sin(rad)
+
+            cx_orig, cy_orig = pre_rot_w / 2.0, pre_rot_h / 2.0
+            cx_rot, cy_rot = post_rot_w / 2.0, post_rot_h / 2.0
+
+            for fface in det_faces_data_for_display:
+                if fface.get("_rotation_rescaled"):
+                    continue
+
+                def unrotate_pts(pts):
+                    if pts is None or len(pts) == 0:
+                        return pts
+                    # 1. Move origin to the rotated center
+                    x0 = pts[:, 0] - cx_rot
+                    y0 = pts[:, 1] - cy_rot
+
+                    # 2. Apply the inverse 2D rotation (clockwise by `angle` in the Y-down image
+                    # coordinate system): x = x'*cos - y'*sin, y = x'*sin + y'*cos.
+                    x1 = x0 * cos_a - y0 * sin_a
+                    y1 = x0 * sin_a + y0 * cos_a
+
+                    # 3. Move back to the original image center
+                    pts[:, 0] = x1 + cx_orig
+                    pts[:, 1] = y1 + cy_orig
+                    return pts
+
+                # Bounding box requires converting to 4 corners, un-rotating, and getting min/max bounds
+                if fface.get("bbox") is not None:
+                    x1, y1, x2, y2 = fface["bbox"]
+                    corners = np.array(
+                        [[x1, y1], [x2, y1], [x2, y2], [x1, y2]], dtype=np.float32
+                    )
+                    unrot_corners = unrotate_pts(corners)
+                    fface["bbox"][0] = np.min(unrot_corners[:, 0])
+                    fface["bbox"][1] = np.min(unrot_corners[:, 1])
+                    fface["bbox"][2] = np.max(unrot_corners[:, 0])
+                    fface["bbox"][3] = np.max(unrot_corners[:, 1])
+
+                # Un-rotate all Keypoint Arrays
+                if fface.get("kps_5") is not None:
+                    fface["kps_5"] = unrotate_pts(fface["kps_5"])
+                if fface.get("kps_all") is not None:
+                    fface["kps_all"] = unrotate_pts(fface["kps_all"])
+                if fface.get("kps_203") is not None:
+                    fface["kps_203"] = unrotate_pts(fface["kps_203"])
+
+                fface["_rotation_rescaled"] = True
+
         if scale_applied:
             # FW-QUAL-11: use renamed img_h/img_w variables
             # FW-PERF-08 / FW-MEM-02: LRU-bounded cache for the scale-back transform.
diff --git a/app/ui/launcher/gittools.py b/app/ui/launcher/gittools.py
index f975293a..b4e93305 100644
--- a/app/ui/launcher/gittools.py
+++ b/app/ui/launcher/gittools.py
@@ -20,9 +20,7 @@
 from PySide6.QtWidgets import QApplication, QMessageBox
 
 
-RELEASE_BAT_URL = (
-    "https://github.com/VisoMasterFusion/VisoMaster-Fusion/releases/latest/download/Start_Portable.bat"
-)
+RELEASE_BAT_URL = "https://github.com/VisoMasterFusion/VisoMaster-Fusion/releases/latest/download/Start_Portable.bat"
 _remote_bat_cache: Path | None = None
 
 
diff --git a/app/ui/widgets/actions/save_load_actions.py b/app/ui/widgets/actions/save_load_actions.py
index dbd8fd27..be3e2541 100644
--- a/app/ui/widgets/actions/save_load_actions.py
+++ b/app/ui/widgets/actions/save_load_actions.py
@@ -483,7 +483,11 @@ def load_saved_workspace(
 
             # OPTIMIZED: Force PySide6 to process the pending 'thumbnail_ready' signals
             # before continuing, ensuring UI elements are fully instantiated.
-            QtWidgets.QApplication.processEvents()
+            while list_view_actions._has_pending_target_media_thumbnail_work(
+                main_window
+            ):
+                list_view_actions._flush_target_media_thumbnail_batch(main_window)
+                QtWidgets.QApplication.processEvents()
 
             # Select target media (Secured with .get to prevent KeyError on older workspaces)
             selected_media_id = data.get("selected_media_id", False)
@@ -779,16 +783,18 @@ def load_saved_workspace(
                     card_actions.uncheck_all_merged_embeddings(main_window)
 
                     for input_face_id in first_face_button.assigned_input_faces.keys():
-                        input_face_button = main_window.input_faces.get(input_face_id)
-                        if input_face_button:
-                            input_face_button.setChecked(True)
+                        assigned_input_btn = main_window.input_faces.get(input_face_id)
+                        if assigned_input_btn:
+                            assigned_input_btn.setChecked(True)
 
                     for (
                         embedding_id
                     ) in first_face_button.assigned_merged_embeddings.keys():
-                        embed_button = main_window.merged_embeddings.get(embedding_id)
-                        if embed_button:
-                            embed_button.setChecked(True)
+                        assigned_embed_btn = main_window.merged_embeddings.get(
+                            embedding_id
+                        )
+                        if assigned_embed_btn:
+                            assigned_embed_btn.setChecked(True)
 
                     main_window.current_kv_tensors_map = getattr(
                         first_face_button, "assigned_kv_map", None
diff --git a/app/ui/widgets/denoiser_layout_data.py b/app/ui/widgets/denoiser_layout_data.py
index 35775f21..b4bd3a59 100644
--- a/app/ui/widgets/denoiser_layout_data.py
+++ b/app/ui/widgets/denoiser_layout_data.py
@@ -18,7 +18,7 @@
             "widget_type": "ParameterSlider",
             "label": "Base Seed",
             "control_name": "DenoiserBaseSeedSlider",
-            "min_value": "1",
+            "min_value": "0",  # 0 is a valid value for the denoiser
             "max_value": "999",
             "default": "220",
             "step": 1,
diff --git a/app/ui/widgets/swapper_layout_data.py b/app/ui/widgets/swapper_layout_data.py
index 8ab62011..63d68488 100644
--- a/app/ui/widgets/swapper_layout_data.py
+++ b/app/ui/widgets/swapper_layout_data.py
@@ -188,7 +188,7 @@
         "BordermaskEnableToggle": {
             "level": 1,
             "label": "Border Mask",
-            "default": False,
+            "default": True,  # Default to True to prevent black square around the swap when no occlusion is selected
             "help": "A rectangle with adjustable bottom, left, right, top, and sides that masks the swapped face result back into the original image.",
         },
         "BorderBottomSlider": {
diff --git a/app/ui/widgets/ui_workers.py b/app/ui/widgets/ui_workers.py
index b187cbf0..0d9edcea 100644
--- a/app/ui/widgets/ui_workers.py
+++ b/app/ui/widgets/ui_workers.py
@@ -46,6 +46,9 @@ def __init__(
         self.sort_files_list_by_name = sort_files_list_by_name
         self.webcam_mode = webcam_mode
         self._running = True  # Flag to control the running state
+        self.control_snapshot = (
+            main_window.control.copy() if getattr(main_window, "control", None) else {}
+        )
 
     def run(self):
         if self.folder_name:
@@ -69,7 +72,7 @@ def load_videos_and_images_from_folder(self, folder_name):
         self.main_window.placeholder_update_signal.emit(
             self.main_window.targetVideosList, True
         )
-        recursive_toggle = self.main_window.control.get(
+        recursive_toggle = self.control_snapshot.get(
             "TargetMediaFolderRecursiveToggle", False
         )
 
@@ -92,7 +95,7 @@ def load_videos_and_images_from_folder(self, folder_name):
                 self.main_window,
                 media_file_path,
                 file_type,
-                cache_thumbnail=False,
+                cache_thumbnail=True,
             )
 
             media_id = self.media_ids[i] if self.media_ids else str(uuid.uuid1().int)
@@ -131,7 +134,7 @@ def load_videos_and_images_from_files_list(self, files_list):
                 self.main_window,
                 media_file_path,
                 file_type=file_type,
-                cache_thumbnail=False,
+                cache_thumbnail=True,
             )
             if q_image:
                 # Emit the signal to update GUI
@@ -146,9 +149,9 @@ def load_webcams(self):
             self.main_window.targetVideosList, True
         )
         camera_backend = CAMERA_BACKENDS[
-            self.main_window.control.get("WebcamBackendSelection", "DirectShow")
+            self.control_snapshot.get("WebcamBackendSelection", "DirectShow")
         ]
-        max_no = int(self.main_window.control.get("WebcamMaxNoSelection", 1))
+        max_no = int(self.control_snapshot.get("WebcamMaxNoSelection", 1))
 
         for i in range(max_no):
             try:
diff --git a/launcher.py b/launcher.py
index a331370c..cc8537d9 100644
--- a/launcher.py
+++ b/launcher.py
@@ -28,7 +28,7 @@
 if str(_repo_root) not in sys.path:
     sys.path.insert(0, str(_repo_root))
 
-from app.ui.launcher.main import main
+from app.ui.launcher.main import main  # noqa: E402
 
 if __name__ == "__main__":
     main()