diff --git a/app/helpers/miscellaneous.py b/app/helpers/miscellaneous.py index 0091fdb9..8582f939 100644 --- a/app/helpers/miscellaneous.py +++ b/app/helpers/miscellaneous.py @@ -1151,14 +1151,26 @@ def keypoints_adjustments( # --- MANUAL ALIGNMENTS (Sliders) --- if parameters.get("FaceAdjEnableToggle", False): - kps_5_adj[:, 0] += parameters["KpsXSlider"] - kps_5_adj[:, 1] += parameters["KpsYSlider"] - kps_5_adj[:, 0] -= 255 - kps_5_adj[:, 0] *= 1 + parameters["KpsScaleSlider"] / 100.0 - kps_5_adj[:, 0] += 255 - kps_5_adj[:, 1] -= 255 - kps_5_adj[:, 1] *= 1 + parameters["KpsScaleSlider"] / 100.0 - kps_5_adj[:, 1] += 255 + # 1. Apply spatial translations (X / Y Axis) + # Adjusts the facial keypoints position based on user-defined offsets. + kps_5_adj[:, 0] += parameters.get("KpsXSlider", 0.0) + kps_5_adj[:, 1] += parameters.get("KpsYSlider", 0.0) + + # 2. Apply spatial scaling + # Resizes the face representation while maintaining its relative geometry. + scale_val = parameters.get("KpsScaleSlider", 0.0) + if scale_val != 0.0: + scale_factor = 1.0 + (scale_val / 100.0) + + # FW-BUG-FIX: Dynamic Centroid Calculation. + # Replaced the hardcoded '255' center with the actual barycenter + # of the face keypoints. This prevents unwanted translation (drift) + # when resizing faces that are not perfectly centered at (255, 255). + centroid = np.mean(kps_5_adj, axis=0) # Returns array([mean_x, mean_y]) + + # Vectorized scaling: (Point - Centroid) * Scale + Centroid + # Computes both X and Y axes simultaneously for optimal NumPy performance. 
+ kps_5_adj = (kps_5_adj - centroid) * scale_factor + centroid if ( parameters.get("LandmarksPositionAdjEnableToggle", False) diff --git a/app/processors/face_detectors.py b/app/processors/face_detectors.py index d81d295d..e75d210d 100644 --- a/app/processors/face_detectors.py +++ b/app/processors/face_detectors.py @@ -236,7 +236,9 @@ def _filter_detections_gpu( skip_nms=False, ): """ - Performs GPU-accelerated NMS, sorting, and filtering on raw detections from all angles. + Performs GPU-accelerated NMS, automatic heuristic sorting, and filtering on raw detections. + Designed for fully automated pipelines: filters out bad rotation artifacts via geometric + sanity checks and elects the best candidates using a non-linear Confidence/Area heuristic. Args: scores_list (list): List of score arrays (np.ndarray) from each detection angle. @@ -244,36 +246,33 @@ def _filter_detections_gpu( kpss_list (list): List of keypoint arrays (np.ndarray) from each detection angle. img_height (int): The *original* height of the source image. img_width (int): The *original* width of the source image. - det_scale (torch.Tensor): The scaling factor used to resize the image (new_height / original_height). - max_num (int): The maximum number of faces to return, sorted by size and centrality. + det_scale (torch.Tensor): The scaling factor used to resize the image. + max_num (int): The maximum number of faces to return. skip_nms (bool): If True, skips the Non-Maximum Suppression step. Returns: tuple: (det, kpss_final, score_values) - - det (np.ndarray): Final bounding boxes, scaled to original image size. - - kpss_final (np.ndarray): Final keypoints, scaled to original image size. - - score_values (np.ndarray): Scores for the final detections. """ if not bboxes_list: return None, None, None - # Convert all raw detection lists to single GPU tensors. 
- scores_tensor = ( - torch.from_numpy(np.vstack(scores_list)) - .to(self.models_processor.device) - .squeeze() + # ---------------------------------------------------------------------- + # 1. TENSOR CREATION & THREAD SAFETY + # ---------------------------------------------------------------------- + # Direct tensor creation with 'device' forces a strict memory copy to VRAM. + # This prevents Race Conditions if numpy arrays are mutated by CPU threads concurrently. + device = self.models_processor.device + + scores_tensor = torch.tensor( + np.vstack(scores_list), dtype=torch.float32, device=device + ).squeeze() + bboxes_tensor = torch.tensor( + np.vstack(bboxes_list), dtype=torch.float32, device=device ) - bboxes_tensor = torch.from_numpy(np.vstack(bboxes_list)).to( - self.models_processor.device + kpss_tensor = torch.tensor( + np.vstack(kpss_list), dtype=torch.float32, device=device ) - kpss_tensor = torch.from_numpy(np.vstack(kpss_list)).to( - self.models_processor.device - ) - - bboxes_tensor = torch.as_tensor(bboxes_tensor, dtype=torch.float32) - scores_tensor = torch.as_tensor(scores_tensor, dtype=torch.float32).reshape(-1) - # --- Validation Block to ensure tensors are well-formed before NMS --- if bboxes_tensor.numel() == 0: return None, None, None if bboxes_tensor.dim() == 1 and bboxes_tensor.numel() == 4: @@ -281,81 +280,100 @@ def _filter_detections_gpu( if scores_tensor.dim() == 0: scores_tensor = scores_tensor.unsqueeze(0) if bboxes_tensor.size(0) != scores_tensor.size(0): - # Mismatch in tensor sizes, aborting. return None, None, None - # Ensure tensors are contiguous (optimizes NMS) bboxes_tensor = bboxes_tensor.contiguous() scores_tensor = scores_tensor.contiguous() + # ---------------------------------------------------------------------- + # 2. NON-MAXIMUM SUPPRESSION (NMS) + # ---------------------------------------------------------------------- if not skip_nms: - # Perform Non-Maximum Suppression on the GPU to remove overlapping boxes. 
nms_thresh = 0.4 + # NMS must rely strictly on raw network confidence. + # Multiplying by area here would allow bloated bad rotations to absorb good ones. keep_indices = nms(bboxes_tensor, scores_tensor, iou_threshold=nms_thresh) - - det_boxes, det_kpss, det_scores = ( - bboxes_tensor[keep_indices], - kpss_tensor[keep_indices], - scores_tensor[keep_indices], - ) + det_boxes = bboxes_tensor[keep_indices] + det_kpss = kpss_tensor[keep_indices] + det_scores = scores_tensor[keep_indices] else: - det_boxes, det_kpss, det_scores = ( - bboxes_tensor, - kpss_tensor, - scores_tensor, - ) - - # Sort the remaining detections by their confidence score. - sorted_indices = torch.argsort(det_scores, descending=True) - det_boxes, det_kpss, det_scores = ( - det_boxes[sorted_indices], - det_kpss[sorted_indices], - det_scores[sorted_indices], - ) - - # If more faces are detected than max_num, select the best ones. + det_boxes = bboxes_tensor + det_kpss = kpss_tensor + det_scores = scores_tensor + + # ---------------------------------------------------------------------- + # 3. GEOMETRIC SANITY CHECK (KPS Boundary Validation) + # ---------------------------------------------------------------------- + # Anomalies from incorrect rotations often produce KPS coordinates that + # fall far outside their own bounding box. We filter these geometric impossibilities. 
+ if det_boxes.shape[0] > 0: + box_widths = det_boxes[:, 2] - det_boxes[:, 0] + box_heights = det_boxes[:, 3] - det_boxes[:, 1] + + # 15% tolerance margin + margin_x = box_widths * 0.15 + margin_y = box_heights * 0.15 + + min_x = det_boxes[:, 0] - margin_x + min_y = det_boxes[:, 1] - margin_y + max_x = det_boxes[:, 2] + margin_x + max_y = det_boxes[:, 3] + margin_y + + # Check if all 5 keypoints are within the expanded bounding box + valid_kps_mask = ( + (det_kpss[:, :, 0] >= min_x.unsqueeze(1)) + & (det_kpss[:, :, 0] <= max_x.unsqueeze(1)) + & (det_kpss[:, :, 1] >= min_y.unsqueeze(1)) + & (det_kpss[:, :, 1] <= max_y.unsqueeze(1)) + ).all(dim=1) + + if valid_kps_mask.any(): + det_boxes = det_boxes[valid_kps_mask] + det_kpss = det_kpss[valid_kps_mask] + det_scores = det_scores[valid_kps_mask] + + # ---------------------------------------------------------------------- + # 4. AUTOMATIC NON-LINEAR SORTING HEURISTIC + # ---------------------------------------------------------------------- if max_num > 0 and det_boxes.shape[0] > max_num: if det_boxes.shape[0] > 1: - # Score faces based on a combination of their size and proximity to the image center. - # This filtering happens on *unscaled* coordinates (relative to the padded detection image). - area = (det_boxes[:, 2] - det_boxes[:, 0]) * ( + areas = (det_boxes[:, 2] - det_boxes[:, 0]) * ( det_boxes[:, 3] - det_boxes[:, 1] ) - # The old logic (img_height / det_scale) was mathematically incorrect and - # produced extreme values for non-standard aspect ratios (like VR videos). - # The correct logic is to find the center of the *active image area* - # on the padded canvas. 
- # new_height_on_canvas = img_height * det_scale - # new_width_on_canvas = img_width * det_scale - det_img_center_y = (img_height * det_scale) / 2.0 - det_img_center_x = (img_width * det_scale) / 2.0 - - center_x = (det_boxes[:, 0] + det_boxes[:, 2]) / 2 - det_img_center_x - center_y = (det_boxes[:, 1] + det_boxes[:, 3]) / 2 - det_img_center_y - - offset_dist_squared = center_x**2 + center_y**2 - # This score favors large faces (area) that are close to the center - # (low offset_dist_squared). - values = area - offset_dist_squared * 2.0 - bindex = torch.argsort(values, descending=True)[:max_num] - det_boxes, det_kpss, det_scores = ( - det_boxes[bindex], - det_kpss[bindex], - det_scores[bindex], - ) - else: - bindex = torch.arange( - det_boxes.shape[0], device=self.models_processor.device - )[:max_num] + areas = areas.clamp(min=1.0) + + # Normalize values to prevent scale dominance + norm_scores = det_scores / (det_scores.max() + 1e-6) + norm_areas = areas / (areas.max() + 1e-6) + + # Non-linear heuristic. Squaring the confidence drastically punishes + # uncertain faces. Only highly confident faces can leverage their 'Area' + # to win the sorting battle. This removes the need for manual UI strategies. + combined_values = (norm_scores**2) * 0.8 + (norm_areas * 0.2) + + bindex = torch.argsort(combined_values, descending=True)[:max_num] + det_boxes = det_boxes[bindex] det_kpss = det_kpss[bindex] det_scores = det_scores[bindex] - - # Transfer final results back to CPU and scale them to the original image dimensions. + else: + det_boxes = det_boxes[:max_num] + det_kpss = det_kpss[:max_num] + det_scores = det_scores[:max_num] + else: + # Standard confidence sort if below max_num + sorted_indices = torch.argsort(det_scores, descending=True) + det_boxes = det_boxes[sorted_indices] + det_kpss = det_kpss[sorted_indices] + det_scores = det_scores[sorted_indices] + + # ---------------------------------------------------------------------- + # 5. 
CPU TRANSFER & FINAL SCALING + # ---------------------------------------------------------------------- det_scale_val = det_scale.cpu().item() + det = det_boxes.cpu().numpy() / det_scale_val kpss_final = det_kpss.cpu().numpy() / det_scale_val - score_values = det_scores.cpu().numpy() return det, kpss_final, score_values @@ -480,17 +498,6 @@ def run_detect( Supports tracking via 'previous_detections'. """ rotation_angles = rotation_angles or [0] - use_multi_rotation = len(rotation_angles) > 1 - - # Multi-angle detection must run the full detector path. Tracking shortcuts - # can reuse stale landmarks from a different orientation and destabilize - # swaps on upside-down faces during recording. - # Also force from_points=True so the secondary landmark model uses kpss-aligned - # crops. Without this, the bbox-only crop path (from_points=False) produces an - # upside-down face crop that landmark detectors cannot handle, which corrupts - # kpss_5 and causes wrong embeddings / ghost-face artifacts. - if use_multi_rotation: - from_points = True control = ( control_override diff --git a/app/processors/face_landmark_detectors.py b/app/processors/face_landmark_detectors.py index e0cab253..9db70210 100644 --- a/app/processors/face_landmark_detectors.py +++ b/app/processors/face_landmark_detectors.py @@ -34,6 +34,11 @@ def _kps5_is_degenerate(kps5) -> bool: """ if kps5 is None: return True + + # Safely convert PyTorch tensors (especially on CUDA) to numpy arrays. + if torch.is_tensor(kps5): + kps5 = kps5.detach().cpu().numpy() + try: kps5 = np.asarray(kps5, dtype=np.float32) except (ValueError, TypeError): @@ -279,17 +284,27 @@ def _prepare_crop( Prepares a cropped and warped face image for a landmark detector. This helper centralizes the repetitive pre-processing logic of aligning a face based on either a bounding box or existing keypoints. 
- Returns: Tuple[torch.Tensor, np.ndarray, np.ndarray]: The cropped image, the forward transform matrix (M), and the inverse transform matrix (IM). """ + import math + if not from_points: # Align the face using the bounding box center and size. w, h = (bbox[2] - bbox[0]), (bbox[3] - bbox[1]) center = (bbox[2] + bbox[0]) / 2, (bbox[3] + bbox[1]) / 2 _scale = target_size / (max(w, h) * scale) - aimg, M = faceutil.transform(img, center, target_size, _scale, 0) + + # Correct math implementation to upright tilted faces in fallback mode. + angle = 0.0 + if det_kpss is not None and len(det_kpss) >= 2: + dx = det_kpss[1][0] - det_kpss[0][0] + dy = det_kpss[1][1] - det_kpss[0][1] + if math.hypot(dx, dy) > 1e-3: + angle = math.degrees(math.atan2(-dy, dx)) + + aimg, M = faceutil.transform(img, center, target_size, _scale, angle) IM = faceutil.invertAffineTransform(M) else: if det_kpss is None or len(det_kpss) == 0: diff --git a/app/processors/face_masks.py b/app/processors/face_masks.py index ecf07f43..cc5abd0d 100644 --- a/app/processors/face_masks.py +++ b/app/processors/face_masks.py @@ -183,15 +183,31 @@ def _enhance_and_align_swapped_mouth( ) enhanced_swap_img[:, ymin:ymax, xmin:xmax] = sharpened_mouth - # --- Alignment Logic (Stable Center of Mass) --- + # --- Alignment Logic (Statistical Anchoring) --- y_s_full, x_s_full = torch.where(mouth_swap) - y_s_inner, _ = torch.where(inner_swap) - # Use Mean (Center of Mass) instead of min/max to avoid temporal jittering + # X-axis centroid of the entire mouth (highly stable spatial reference) cx_s = x_s_full.float().mean() - # Keep the top of the inner mouth as Y anchor (but average the top 5% to avoid 1-pixel noise) - top_y_k = max(1, int(y_s_inner.shape[0] * 0.01)) - cy_s = torch.topk(y_s_inner.float(), k=top_y_k, largest=False).values.mean() + + # FW-PERF-FIX: Pure GPU control flow (CUDA Sync Avoidance) + # Using pure boolean tensor logic avoids CPU-GPU synchronization. 
+ mask_upper = labels_swap == 12 + has_upper = mask_upper.any() + + # Dynamically select the mask: if upper lip exists, use it. Otherwise, fallback to inner_swap. + target_mask = mask_upper | (inner_swap & ~has_upper) + y_s_target, _ = torch.where(target_mask) + + # FW-BUG-FIX: "Statistical Boundary" anchoring. + # Calculates the teeth line by averaging all pixels of the target mask. + mean_ys = y_s_target.float().mean() + std_ys = y_s_target.float().std() + + # Gracefully handle NaN strictly on the GPU (e.g., if only 1 pixel is found, std is NaN) + std_ys = torch.nan_to_num(std_ys, nan=0.0) + + # The center of the reference region + 1.5x its standard deviation = optimal teeth alignment line + cy_s = mean_ys + 1.5 * std_ys mouthzoom = parameters.get("MouthParserStretchDecimalSlider", 1.05) @@ -306,32 +322,60 @@ def _enhance_and_align_original_mouth( ) enhanced_img_orig[:, ymin:ymax, xmin:xmax] = sharpened_mouth - # --- Alignment Logic (Stable Center of Mass) --- + # --- Alignment Logic (Statistical Anchoring) --- y_o_full, x_o_full = torch.where(mouth_orig) y_s_full, x_s_full = torch.where(mouth_swap) - w_o = (x_o_full.max() - x_o_full.min()).float() - w_s = (x_s_full.max() - x_s_full.min()).float() + # 1. SCALE (Width Standard Deviation) + # Width is calculated based on spatial dispersion. Even if the mask jitters, + # the scale factor will remain completely stable over time. + std_x_o = x_o_full.float().std() + std_x_s = x_s_full.float().std() - if w_o <= 0.0 or w_s <= 0.0: + if ( + std_x_o <= 0.0 + or std_x_s <= 0.0 + or torch.isnan(std_x_o) + or torch.isnan(std_x_s) + ): return None, None + mouthzoom = parameters.get("MouthParserStretchDecimalSlider", 1.05) + scale_factor = (std_x_s / std_x_o) * mouthzoom + + # 2. X-AXIS CENTROID cx_o = x_o_full.float().mean() cx_s = x_s_full.float().mean() - y_anchor_orig = torch.where(inner_orig)[0] - y_anchor_swap = torch.where(inner_swap)[0] + # 3. 
Y-AXIS ANCHORING (Statistical Boundary of the upper lip) + # FW-PERF-FIX: Pure GPU control flow (CUDA Sync Avoidance) + # Replaced CPU-side 'len()' checks with pure boolean tensor logic. - top_o_k = max(1, int(y_anchor_orig.shape[0] * 0.01)) - top_s_k = max(1, int(y_anchor_swap.shape[0] * 0.01)) - cy_o = torch.topk(y_anchor_orig.float(), k=top_o_k, largest=False).values.mean() - cy_s = torch.topk(y_anchor_swap.float(), k=top_s_k, largest=False).values.mean() + # --- Original Mouth Anchoring --- + mask_o_upper = labels_orig == 12 + has_o_upper = mask_o_upper.any() - mouthzoom = parameters.get("MouthParserStretchDecimalSlider", 1.05) - scale_factor = (w_s / w_o) * mouthzoom + # Fallback to the entire original mouth if no upper lip is detected + target_mask_o = mask_o_upper | (mouth_orig & ~has_o_upper) + y_o_target, _ = torch.where(target_mask_o) - if scale_factor <= 0.0: - return None, None + mean_yo = y_o_target.float().mean() + std_yo = y_o_target.float().std() + std_yo = torch.nan_to_num(std_yo, nan=0.0) + cy_o = mean_yo + 1.5 * std_yo + + # --- Swapped Mouth Anchoring --- + mask_s_upper = labels_swap == 12 + has_s_upper = mask_s_upper.any() + + # Fallback to the entire swapped mouth if no upper lip is detected + target_mask_s = mask_s_upper | (mouth_swap & ~has_s_upper) + y_s_target, _ = torch.where(target_mask_s) + + mean_ys = y_s_target.float().mean() + std_ys = y_s_target.float().std() + std_ys = torch.nan_to_num(std_ys, nan=0.0) + cy_s = mean_ys + 1.5 * std_ys translate_x = cx_s - cx_o translate_y = cy_s - cy_o @@ -364,6 +408,8 @@ def _enhance_and_align_original_mouth( content_mask.unsqueeze(0), kernel_size=5, sigma=1.0 ).squeeze(0) + w_s = (x_s_full.max() - x_s_full.min()).float() + # 2. 
Destroy the fake teeth with blur (Controlled by UI Slider) cavity_blur_pct = parameters.get("MouthOriginalCavityBlurSlider", 15) / 100.0 # Calculate dynamic kernel size based on mouth width and user slider diff --git a/app/processors/video_utils/sequential_detector.py b/app/processors/video_utils/sequential_detector.py index 8db778fa..86acbe3d 100644 --- a/app/processors/video_utils/sequential_detector.py +++ b/app/processors/video_utils/sequential_detector.py @@ -310,45 +310,54 @@ def run( # to prevent downstream ".copy()" calls from throwing AttributeError. kps_standard = current_kps5.copy() - # Extract standard dense landmarks (68 or 203 depending on UI selection) - if use_landmark: - _, lm_kpss, _ = self.main_window.models_processor.run_detect_landmark( + # FW-LOGIC-FIX 1: Extract forced 203 landmarks FIRST if advanced editing features demand it. + # This ensures we always have a properly aligned 203 if required, independently of UI settings. + kps_203_local = numpy.zeros((203, 2), dtype=numpy.float32) + has_valid_203 = False + + if requires_203: + _, lm_203, _ = self.main_window.models_processor.run_detect_landmark( frame_tensor, current_bbox, current_kps5, - detect_mode=landmark_mode, - score=control.get("LandmarkDetectScoreSlider", 50) / 100.0, + detect_mode="203", + score=0.5, use_mean_eyes=control.get("LandmarkMeanEyesToggle", False), - from_points=True, + from_points=True, # STRICTLY REQUIRED FOR INTERNAL ALIGNMENT ) - if len(lm_kpss) > 0: - kps_standard = lm_kpss - - filtered_kpss.append(kps_standard) - - # Extract forced 203 landmarks if advanced editing features demand it - if requires_203: - kps_203_local = numpy.zeros((203, 2), dtype=numpy.float32) + if len(lm_203) > 0: + kps_203_local = lm_203 + has_valid_203 = True + filtered_kpss_203.append(kps_203_local) - # Optimization: If the user already selected 203 as standard, reuse it to save a CUDA call - if use_landmark and landmark_mode == "203" and len(kps_standard) == 203: - kps_203_local = 
kps_standard.copy() + # FW-LOGIC-FIX 2: Extract standard dense landmarks (68, 203 or 478 depending on UI selection) + if use_landmark: + # OPTIMIZATION: Reuse the 203 landmarks computed above ONLY IF + # the user explicitly enabled 'from_points' in the UI. + # This effectively prevents a redundant and costly neural network forward pass. + if ( + landmark_mode == "203" + and requires_203 + and has_valid_203 + and from_points + ): + kps_standard = kps_203_local.copy() else: - _, lm_203, _ = ( + _, lm_kpss, _ = ( self.main_window.models_processor.run_detect_landmark( frame_tensor, current_bbox, current_kps5, - detect_mode="203", - score=0.5, + detect_mode=landmark_mode, + score=control.get("LandmarkDetectScoreSlider", 50) / 100.0, use_mean_eyes=control.get("LandmarkMeanEyesToggle", False), - from_points=True, + from_points=from_points, ) ) - if len(lm_203) > 0: - kps_203_local = lm_203 + if len(lm_kpss) > 0: + kps_standard = lm_kpss - filtered_kpss_203.append(kps_203_local) + filtered_kpss.append(kps_standard) # Reformat output arrays to match the expected pipeline signature (CRITICAL TYPE CASTING) bboxes = numpy.array(filtered_bboxes, dtype=numpy.float32) diff --git a/app/processors/workers/frame_worker.py b/app/processors/workers/frame_worker.py index de2b065e..3bd49797 100644 --- a/app/processors/workers/frame_worker.py +++ b/app/processors/workers/frame_worker.py @@ -238,16 +238,8 @@ def __init__( ).view(1, 1, 3, 3) self.kernel_sobel_y = self.kernel_sobel_x.transpose(2, 3) - # FW-CPU-1: Per-worker CUDA stream, otherwise all 8 pool workers share the - # default stream and every `current_stream().synchronize()` call inside - # _run_model_with_lazy_build_check (face_swappers / face_masks / face_restorers / - # frame_enhancers / face_landmark_detectors) waits for *every* worker's - # pending GPU work. 
With CUDA 13's spin-wait scheduler that turns each pool - # worker into a 100%-CPU spinner; pinning each worker to its own stream - # cuts the sync surface back down to what the worker itself submitted. - self.worker_stream = ( - torch.cuda.Stream() if self.models_processor.device == "cuda" else None - ) + # Do not use local streams here ! Onnxruntime handles independent streams internally for each worker (fixes VRAM explosion) + self.worker_stream = None # (torch.cuda.Stream() if self.models_processor.device == "cuda" else None) def set_scaling_transforms(self, control_params): """Initializes the torchvision transforms based on user interpolation settings.""" @@ -1995,6 +1987,10 @@ def _process_frame_standard( k[:, 1] *= ratio_h # Manual Rotation + # FW-BUG-FIX: Store pre- and post-rotation dimensions to accurately + # reverse the affine transform later without accumulating black borders. + pre_rot_h, pre_rot_w = img.shape[1], img.shape[2] + if control["ManualRotationEnableToggle"]: img = v2.functional.rotate( img, @@ -2003,6 +1999,8 @@ def _process_frame_standard( expand=True, ) + post_rot_h, post_rot_w = img.shape[1], img.shape[2] + # --- DETECTION PHASE --- # The workers are now "Stateless Render Engines". They no longer track time or state. # They consume perfectly sequenced and EMA-smoothed detections from the Feeder thread. @@ -2053,7 +2051,10 @@ def _process_frame_standard( requires_203 = True break - # STEP 1: Standard detection respecting User's choice + # --- STEP 1: Standard detection (Respect UI Toggle) --- + # We pass 'use_landmark_detection=use_landmark' to allow run_detect + # to attempt an initial extraction. If Auto-Rotation is enabled, + # run_detect may bypass dense landmark extraction if the angle is too extreme. 
bboxes, kpss_5, kpss = self.models_processor.run_detect( img, control.get("DetectorModelSelection", "RetinaFace"), @@ -2063,7 +2064,7 @@ def _process_frame_standard( use_landmark_detection=use_landmark, landmark_detect_mode=landmark_mode, landmark_score=control.get("LandmarkDetectScoreSlider", 50) / 100.0, - from_points=from_points, + from_points=from_points, # Respects the UI toggle state rotation_angles=[0] if not control.get("AutoRotationToggle", False) else [0, 90, 180, 270], @@ -2072,32 +2073,87 @@ def _process_frame_standard( bypass_bytetrack=True, ) - # STEP 2: Smart Double-Scan for 203 points + # FW-LOGIC-FIX: Validate if Step 1 returned actual dense landmarks (> 5 points). + # If run_detect aborted dense extraction due to an extreme rotation angle, + # 'kpss' will merely contain a fallback copy of the sparse 'kpss_5'. + has_valid_dense_kpss = ( + kpss is not None + and len(kpss) == len(bboxes) + and len(bboxes) > 0 + and kpss[0].shape[0] > 5 + ) + + # --- STEP 2: 203-Landmark Extraction (Expression Restorer / Face Editor) --- kpss_203 = None if requires_203: - if use_landmark and landmark_mode == "203": - # OPTIMIZATION: 203 was already extracted by the user's choice. Zero CUDA cost. 
- kpss_203 = kpss - else: - # Extract 203 specifically for advanced features - _, _, kpss_203 = self.models_processor.run_detect( - img, - control.get("DetectorModelSelection", "RetinaFace"), - max_num=control.get("MaxFacesToDetectSlider", 1), - score=control.get("DetectorScoreSlider", 50) / 100.0, - input_size=(512, 512), - use_landmark_detection=True, - landmark_detect_mode="203", - landmark_score=control.get("LandmarkDetectScoreSlider", 50) - / 100.0, - from_points=from_points, - rotation_angles=[0] - if not control.get("AutoRotationToggle", False) - else [0, 90, 180, 270], - use_mean_eyes=control.get("LandmarkMeanEyesToggle", False), - previous_detections=None, - bypass_bytetrack=True, - ) + kpss_203_list = [] + # LOGIC FIX: We ONLY reuse Step 1 landmarks if they are already in the 203 + # format AND the user enabled 'from_points'. Otherwise, we MUST re-extract + # with 'from_points=True' to ensure geometric alignment for the Expression Restorer. + can_reuse_step1_203 = ( + has_valid_dense_kpss and landmark_mode == "203" and from_points + ) + + if bboxes is not None and len(bboxes) > 0: + for idx in range(len(bboxes)): + if can_reuse_step1_203: + kps_203_local = kpss[idx].copy() + else: + # FORCE from_points=True: Strictly required for the geometric + # alignment of advanced tools (FaceEditor, Makeup, Expressions). 
+ _, lm_203, _ = self.models_processor.run_detect_landmark( + img, + bboxes[idx], + kpss_5[idx], + detect_mode="203", + score=0.5, + use_mean_eyes=control.get( + "LandmarkMeanEyesToggle", False + ), + from_points=True, # STRICTLY REQUIRED HERE + ) + kps_203_local = ( + lm_203 + if len(lm_203) > 0 + else np.zeros((203, 2), dtype=np.float32) + ) + kpss_203_list.append(kps_203_local) + kpss_203 = np.array(kpss_203_list, dtype=object) + + # --- STEP 3: Fallback for standard landmarks (UI Display) --- + if ( + use_landmark + and not has_valid_dense_kpss + and bboxes is not None + and len(bboxes) > 0 + ): + kpss_list = [] + for idx in range(len(bboxes)): + # Smart reuse: If Step 2 just computed 203 landmarks, use them + # to avoid a redundant neural network forward pass. + if ( + landmark_mode == "203" + and kpss_203 is not None + and len(kpss_203) > idx + ): + kpss_list.append(kpss_203[idx]) + else: + # Respects the UI toggle state for standard UI landmarks + _, lm_std, _ = self.models_processor.run_detect_landmark( + img, + bboxes[idx], + kpss_5[idx], + detect_mode=landmark_mode, + score=control.get("LandmarkDetectScoreSlider", 50) / 100.0, + use_mean_eyes=control.get("LandmarkMeanEyesToggle", False), + from_points=from_points, + ) + kpss_list.append( + lm_std + if len(lm_std) > 0 + else np.zeros((int(landmark_mode), 2), dtype=np.float32) + ) + kpss = np.array(kpss_list, dtype=object) if ( isinstance(kpss_5, np.ndarray) @@ -2422,12 +2478,71 @@ def _process_frame_standard( # Undo Rotation / Scaling if control["ManualRotationEnableToggle"]: + angle = control["ManualRotationAngleSlider"] + + # FW-BUG-FIX 1: Reverse rotation WITH expand=True so the canvas can physically + # accommodate the restored dimensions (fixes the 90/270 degree crop). + # Then center crop to strictly restore the original tensor shape. 
img = v2.functional.rotate( img, - angle=-control["ManualRotationAngleSlider"], + angle=-angle, interpolation=v2.InterpolationMode.BILINEAR, - expand=True, + expand=True, # Unchanged: expand was already True; the center_crop below is what restores the original shape ) + img = v2.functional.center_crop(img, (pre_rot_h, pre_rot_w)) + + # FW-MATH: Reverse the affine transform on all spatial coordinates. + import math + + rad = math.radians(angle) + cos_a, sin_a = math.cos(rad), math.sin(rad) + + cx_orig, cy_orig = pre_rot_w / 2.0, pre_rot_h / 2.0 + cx_rot, cy_rot = post_rot_w / 2.0, post_rot_h / 2.0 + + for fface in det_faces_data_for_display: + if fface.get("_rotation_rescaled"): + continue + + def unrotate_pts(pts): + if pts is None or len(pts) == 0: + return pts + # 1. Move origin to the rotated center + x0 = pts[:, 0] - cx_rot + y0 = pts[:, 1] - cy_rot + + # 2. Apply 2D inverse rotation matrix (Clockwise in Y-down coordinate system) + # BUG FIX: The previous signs were inverted, causing points to rotate further away! + x1 = x0 * cos_a - y0 * sin_a + y1 = x0 * sin_a + y0 * cos_a + + # 3. 
Move back to the original image center + pts[:, 0] = x1 + cx_orig + pts[:, 1] = y1 + cy_orig + return pts + + # Bounding box requires converting to 4 corners, un-rotating, and getting min/max bounds + if fface.get("bbox") is not None: + x1, y1, x2, y2 = fface["bbox"] + corners = np.array( + [[x1, y1], [x2, y1], [x2, y2], [x1, y2]], dtype=np.float32 + ) + unrot_corners = unrotate_pts(corners) + fface["bbox"][0] = np.min(unrot_corners[:, 0]) + fface["bbox"][1] = np.min(unrot_corners[:, 1]) + fface["bbox"][2] = np.max(unrot_corners[:, 0]) + fface["bbox"][3] = np.max(unrot_corners[:, 1]) + + # Un-rotate all Keypoint Arrays + if fface.get("kps_5") is not None: + fface["kps_5"] = unrotate_pts(fface["kps_5"]) + if fface.get("kps_all") is not None: + fface["kps_all"] = unrotate_pts(fface["kps_all"]) + if fface.get("kps_203") is not None: + fface["kps_203"] = unrotate_pts(fface["kps_203"]) + + fface["_rotation_rescaled"] = True + if scale_applied: # FW-QUAL-11: use renamed img_h/img_w variables # FW-PERF-08 / FW-MEM-02: LRU-bounded cache for the scale-back transform. 
diff --git a/app/ui/launcher/gittools.py b/app/ui/launcher/gittools.py index f975293a..b4e93305 100644 --- a/app/ui/launcher/gittools.py +++ b/app/ui/launcher/gittools.py @@ -20,9 +20,7 @@ from PySide6.QtWidgets import QApplication, QMessageBox -RELEASE_BAT_URL = ( - "https://github.com/VisoMasterFusion/VisoMaster-Fusion/releases/latest/download/Start_Portable.bat" -) +RELEASE_BAT_URL = "https://github.com/VisoMasterFusion/VisoMaster-Fusion/releases/latest/download/Start_Portable.bat" _remote_bat_cache: Path | None = None diff --git a/app/ui/widgets/actions/save_load_actions.py b/app/ui/widgets/actions/save_load_actions.py index dbd8fd27..be3e2541 100644 --- a/app/ui/widgets/actions/save_load_actions.py +++ b/app/ui/widgets/actions/save_load_actions.py @@ -483,7 +483,11 @@ def load_saved_workspace( # OPTIMIZED: Force PySide6 to process the pending 'thumbnail_ready' signals # before continuing, ensuring UI elements are fully instantiated. - QtWidgets.QApplication.processEvents() + while list_view_actions._has_pending_target_media_thumbnail_work( + main_window + ): + list_view_actions._flush_target_media_thumbnail_batch(main_window) + QtWidgets.QApplication.processEvents() # Select target media (Secured with .get to prevent KeyError on older workspaces) selected_media_id = data.get("selected_media_id", False) @@ -779,16 +783,18 @@ def load_saved_workspace( card_actions.uncheck_all_merged_embeddings(main_window) for input_face_id in first_face_button.assigned_input_faces.keys(): - input_face_button = main_window.input_faces.get(input_face_id) - if input_face_button: - input_face_button.setChecked(True) + assigned_input_btn = main_window.input_faces.get(input_face_id) + if assigned_input_btn: + assigned_input_btn.setChecked(True) for ( embedding_id ) in first_face_button.assigned_merged_embeddings.keys(): - embed_button = main_window.merged_embeddings.get(embedding_id) - if embed_button: - embed_button.setChecked(True) + assigned_embed_btn = 
main_window.merged_embeddings.get( + embedding_id + ) + if assigned_embed_btn: + assigned_embed_btn.setChecked(True) main_window.current_kv_tensors_map = getattr( first_face_button, "assigned_kv_map", None diff --git a/app/ui/widgets/denoiser_layout_data.py b/app/ui/widgets/denoiser_layout_data.py index 35775f21..b4bd3a59 100644 --- a/app/ui/widgets/denoiser_layout_data.py +++ b/app/ui/widgets/denoiser_layout_data.py @@ -18,7 +18,7 @@ "widget_type": "ParameterSlider", "label": "Base Seed", "control_name": "DenoiserBaseSeedSlider", - "min_value": "1", + "min_value": "0", # 0 is a valid value for the denoiser "max_value": "999", "default": "220", "step": 1, diff --git a/app/ui/widgets/swapper_layout_data.py b/app/ui/widgets/swapper_layout_data.py index 8ab62011..63d68488 100644 --- a/app/ui/widgets/swapper_layout_data.py +++ b/app/ui/widgets/swapper_layout_data.py @@ -188,7 +188,7 @@ "BordermaskEnableToggle": { "level": 1, "label": "Border Mask", - "default": False, + "default": True, # Default to True to prevent a black square around the swap when no occlusion is selected "help": "A rectangle with adjustable bottom, left, right, top, and sides that masks the swapped face result back into the original image.", }, "BorderBottomSlider": { diff --git a/app/ui/widgets/ui_workers.py b/app/ui/widgets/ui_workers.py index b187cbf0..0d9edcea 100644 --- a/app/ui/widgets/ui_workers.py +++ b/app/ui/widgets/ui_workers.py @@ -46,6 +46,9 @@ def __init__( self.sort_files_list_by_name = sort_files_list_by_name self.webcam_mode = webcam_mode self._running = True # Flag to control the running state + self.control_snapshot = ( + main_window.control.copy() if getattr(main_window, "control", None) else {} + ) def run(self): if self.folder_name: @@ -69,7 +72,7 @@ def load_videos_and_images_from_folder(self, folder_name): self.main_window.placeholder_update_signal.emit( self.main_window.targetVideosList, True ) - recursive_toggle = self.main_window.control.get( + recursive_toggle = 
self.control_snapshot.get( "TargetMediaFolderRecursiveToggle", False ) @@ -92,7 +95,7 @@ def load_videos_and_images_from_folder(self, folder_name): self.main_window, media_file_path, file_type, - cache_thumbnail=False, + cache_thumbnail=True, ) media_id = self.media_ids[i] if self.media_ids else str(uuid.uuid1().int) @@ -131,7 +134,7 @@ def load_videos_and_images_from_files_list(self, files_list): self.main_window, media_file_path, file_type=file_type, - cache_thumbnail=False, + cache_thumbnail=True, ) if q_image: # Emit the signal to update GUI @@ -146,9 +149,9 @@ def load_webcams(self): self.main_window.targetVideosList, True ) camera_backend = CAMERA_BACKENDS[ - self.main_window.control.get("WebcamBackendSelection", "DirectShow") + self.control_snapshot.get("WebcamBackendSelection", "DirectShow") ] - max_no = int(self.main_window.control.get("WebcamMaxNoSelection", 1)) + max_no = int(self.control_snapshot.get("WebcamMaxNoSelection", 1)) for i in range(max_no): try: diff --git a/launcher.py b/launcher.py index a331370c..cc8537d9 100644 --- a/launcher.py +++ b/launcher.py @@ -28,7 +28,7 @@ if str(_repo_root) not in sys.path: sys.path.insert(0, str(_repo_root)) -from app.ui.launcher.main import main +from app.ui.launcher.main import main # noqa: E402 if __name__ == "__main__": main()