refine predict.py

Pandede · Pandede · commit fd6890ddac40 · 2025-06-02T16:12:53.000+08:00
diff --git a/predict.py b/predict.py
@@ -7,56 +7,51 @@
 from wpodnet import Predictor, load_wpodnet_from_checkpoint
 from wpodnet.stream import ImageStreamer
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     parser = ArgumentParser()
+    parser.add_argument("source", type=str, help="the path to the image")
     parser.add_argument(
-        'source',
-        type=str,
-        help='the path to the image'
-    )
-    parser.add_argument(
-        '-w', '--weight',
-        type=str,
-        required=True,
-        help='the path to the model weight'
+        "-w", "--weight", type=str, required=True, help="the path to the model weight"
     )
     parser.add_argument(
-        '--scale',
+        "--scale",
         type=float,
         default=1.0,
-        help='adjust the scaling ratio. default to 1.0.'
+        help="adjust the scaling ratio. default to 1.0.",
     )
     parser.add_argument(
-        '--save-annotated',
+        "--save-annotated",
         type=str,
-        help='save the annotated image at the given folder'
+        help="save the annotated image at the given folder",
     )
     parser.add_argument(
-        '--save-warped',
-        type=str,
-        help='save the warped image at the given folder'
+        "--save-warped", type=str, help="save the warped image at the given folder"
     )
     args = parser.parse_args()
 
     if args.scale <= 0.0:
-        raise ArgumentTypeError(message='scale must be greater than 0.0')
+        raise ArgumentTypeError("scale must be greater than 0.0")
 
     if args.save_annotated is not None:
         save_annotated = Path(args.save_annotated)
         if not save_annotated.is_dir():
-            raise FileNotFoundError(errno.ENOTDIR, 'No such directory', args.save_annotated)
+            raise FileNotFoundError(
+                errno.ENOTDIR, "No such directory", args.save_annotated
+            )
     else:
         save_annotated = None
 
     if args.save_warped is not None:
         save_warped = Path(args.save_warped)
         if not save_warped.is_dir():
-            raise FileNotFoundError(errno.ENOTDIR, 'No such directory', args.save_warped)
+            raise FileNotFoundError(
+                errno.ENOTDIR, "No such directory", args.save_warped
+            )
     else:
         save_warped = None
 
     # Prepare for the model
-    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    device = "cuda" if torch.cuda.is_available() else "cpu"
     model = load_wpodnet_from_checkpoint(args.weight).to(device)
 
     predictor = Predictor(model)
@@ -65,20 +60,22 @@
     for i, image in enumerate(streamer):
         prediction = predictor.predict(image, scaling_ratio=args.scale)
 
-        print(f'Prediction #{i}')
-        print('  bounds', prediction.bounds.tolist())
-        print('  confidence', prediction.confidence)
+        print(f"Prediction #{i}")
+        print("  bounds", prediction.bounds)
+        print("  confidence", prediction.confidence)
 
         if save_annotated:
             annotated_path = save_annotated / Path(image.filename).name
-            annotated = prediction.annotate()
-            annotated.save(annotated_path)
-            print(f'Saved the annotated image at {annotated_path}')
+
+            canvas = image.copy()
+            prediction.annotate(canvas, outline="red")
+            canvas.save(annotated_path)
+            print(f"Saved the annotated image at {annotated_path}")
 
         if save_warped:
             warped_path = save_warped / Path(image.filename).name
-            warped = prediction.warp()
+            warped = prediction.warp(image)
             warped.save(warped_path)
-            print(f'Saved the warped image at {warped_path}')
+            print(f"Saved the warped image at {warped_path}")
 
         print()
diff --git a/wpodnet/backend.py b/wpodnet/backend.py
@@ -1,4 +1,5 @@
-from typing import List, Tuple
+from dataclasses import dataclass
+from typing import List, Optional, Tuple
 
 import numpy as np
 import torch
@@ -8,66 +9,116 @@
 from .model import WPODNet
 
 
+@dataclass
 class Prediction:
-    def __init__(self, image: Image.Image, bounds: np.ndarray, confidence: float):
-        self.image = image
-        self.bounds = bounds
-        self.confidence = confidence
-
-    def _get_perspective_coeffs(self, width: int, height: int) -> List[float]:
-        # Get the perspective matrix
-        src_points = self.bounds.tolist()
-        dst_points = [[0, 0], [width, 0], [width, height], [0, height]]
-        return _get_perspective_coeffs(src_points, dst_points)
-
-    def annotate(self, outline: str = 'red', width: int = 3) -> Image.Image:
-        canvas = self.image.copy()
+    """
+    The prediction result from WPODNet.
+
+    Attributes:
+        bounds (List[Tuple[int, int]]): The bounding coordinates of the detected license plate. Must be a list of 4 points (x, y).
+        confidence (float): The confidence score of the detection. Must be between 0.0 and 1.0.
+    """
+
+    bounds: List[Tuple[int, int]]
+    confidence: float
+
+    def __post_init__(self):
+        if len(self.bounds) != 4:
+            raise ValueError(
+                f"expected bounds to have 4 points, got {len(self.bounds)} points"
+            )
+        if self.confidence < 0 or self.confidence > 1:
+            raise ValueError(
+                f"confidence must be between 0.0 and 1.0, got {self.confidence}"
+            )
+
+    def annotate(
+        self,
+        canvas: Image.Image,
+        fill: Optional[str] = None,
+        outline: Optional[str] = None,
+        width: int = 1,
+    ) -> None:
+        """
+        Draws the bounding polygon onto the given image in place; nothing is returned.
+
+        Args:
+            canvas (PIL.Image.Image): The image to draw on. Pass a copy to keep the original untouched.
+            fill (Optional[str]): The fill color for the polygon. Defaults to None.
+            outline (Optional[str]): The outline color for the polygon. Defaults to None.
+            width (int): The width of the outline. Defaults to 1.
+
+        Note:
+            The arguments `fill`, `outline`, and `width` are passed to the `ImageDraw.Draw.polygon` method.
+            See https://pillow.readthedocs.io/en/stable/reference/ImageDraw.html#PIL.ImageDraw.ImageDraw.polygon.
+        """
         drawer = ImageDraw.Draw(canvas)
-        drawer.polygon(
-            [(x, y) for x, y in self.bounds],
-            outline=outline,
-            width=width
+        drawer.polygon(self.bounds, fill=fill, outline=outline, width=width)
+
+    def warp(self, canvas: Image.Image) -> Image.Image:
+        """
+        Applies a perspective transform whose start points are the bounding polygon and whose end points are the corners of `canvas`.
+
+        Args:
+            canvas (PIL.Image.Image): The image to be warped. Its size is also used as the output size.
+
+        Returns:
+            PIL.Image.Image: The warped image, with the same size as `canvas`.
+        """
+        coeffs = _get_perspective_coeffs(
+            startpoints=self.bounds,
+            endpoints=[
+                (0, 0),
+                (canvas.width, 0),
+                (canvas.width, canvas.height),
+                (0, canvas.height),
+            ],
         )
-        return canvas
+        return canvas.transform(
+            (canvas.width, canvas.height), Image.Transform.PERSPECTIVE, coeffs
+        )
+
 
-    def warp(self, width: int = 208, height: int = 60) -> Image.Image:
-        # Get the perspective matrix
-        coeffs = self._get_perspective_coeffs(width, height)
-        warped = self.image.transform((width, height), Image.PERSPECTIVE, coeffs)
-        return warped
+Q = np.array(
+    [
+        [-0.5, 0.5, 0.5, -0.5],
+        [-0.5, -0.5, 0.5, 0.5],
+        [1.0, 1.0, 1.0, 1.0],
+    ]
+)
 
 
 class Predictor:
-    _q = np.array([
-        [-.5, .5, .5, -.5],
-        [-.5, -.5, .5, .5],
-        [1., 1., 1., 1.]
-    ])
-    _scaling_const = 7.75
-    _stride = 16
-
-    def __init__(self, wpodnet: WPODNet):
+    """A wrapper class for WPODNet to make predictions."""
+
+    def __init__(self, wpodnet: WPODNet) -> None:
+        """
+        Args:
+            wpodnet (WPODNet): The WPODNet model to use for prediction.
+        """
         self.wpodnet = wpodnet
         self.wpodnet.eval()
 
-    def _resize_to_fixed_ratio(self, image: Image.Image, dim_min: int, dim_max: int) -> Image.Image:
+    def _resize_to_fixed_ratio(
+        self, image: Image.Image, dim_min: int, dim_max: int
+    ) -> Image.Image:
         h, w = image.height, image.width
 
         wh_ratio = max(h, w) / min(h, w)
         side = int(wh_ratio * dim_min)
-        bound_dim = min(side + side % self._stride, dim_max)
+        bound_dim = min(side + side % self.wpodnet.stride, dim_max)
 
         factor = bound_dim / max(h, w)
         reg_w, reg_h = int(w * factor), int(h * factor)
 
-        # Ensure the both width and height are the multiply of `self._stride`
-        reg_w_mod = reg_w % self._stride
+        # Round both width and height up to the next multiple of `self.wpodnet.stride`
+        reg_w_mod = reg_w % self.wpodnet.stride
         if reg_w_mod > 0:
-            reg_w += self._stride - reg_w_mod
+            reg_w += self.wpodnet.stride - reg_w_mod
 
-        reg_h_mod = reg_h % self._stride
+        reg_h_mod = reg_h % self.wpodnet.stride
         if reg_h_mod > 0:
-            reg_h += self._stride - reg_h_mod
+            reg_h += self.wpodnet.stride - reg_h_mod
 
         return image.resize((reg_w, reg_h))
 
@@ -82,32 +133,56 @@ def _inference(self, image: torch.Tensor) -> Tuple[np.ndarray, np.ndarray]:
         # Convert to squeezed numpy array
         # grid_w: The number of anchors in row
         # grid_h: The number of anchors in column
-        probs = np.squeeze(probs.cpu().numpy())[0]     # (grid_h, grid_w)
+        probs = np.squeeze(probs.cpu().numpy())[0]  # (grid_h, grid_w)
         affines = np.squeeze(affines.cpu().numpy())  # (6, grid_h, grid_w)
 
         return probs, affines
 
     def _get_max_anchor(self, probs: np.ndarray) -> Tuple[int, int]:
         return np.unravel_index(probs.argmax(), probs.shape)
 
-    def _get_bounds(self, affines: np.ndarray, anchor_y: int, anchor_x: int, scaling_ratio: float = 1.0) -> np.ndarray:
+    def _get_bounds(
+        self,
+        affines: np.ndarray,
+        anchor_y: int,
+        anchor_x: int,
+        scaling_ratio: float = 1.0,
+    ) -> np.ndarray:
         # Compute theta
         theta = affines[:, anchor_y, anchor_x]
         theta = theta.reshape((2, 3))
         theta[0, 0] = max(theta[0, 0], 0.0)
         theta[1, 1] = max(theta[1, 1], 0.0)
 
         # Convert theta into the bounding polygon
-        bounds = np.matmul(theta, self._q) * self._scaling_const * scaling_ratio
+        bounds = np.matmul(theta, Q) * self.wpodnet.scale_factor * scaling_ratio
 
         # Normalize the bounds
         _, grid_h, grid_w = affines.shape
-        bounds[0] = (bounds[0] + anchor_x + .5) / grid_w
-        bounds[1] = (bounds[1] + anchor_y + .5) / grid_h
+        bounds[0] = (bounds[0] + anchor_x + 0.5) / grid_w
+        bounds[1] = (bounds[1] + anchor_y + 0.5) / grid_h
 
         return np.transpose(bounds)
 
-    def predict(self, image: Image.Image, scaling_ratio: float = 1.0, dim_min: int = 512, dim_max: int = 768) -> Prediction:
+    def predict(
+        self,
+        image: Image.Image,
+        scaling_ratio: float = 1.0,
+        dim_min: int = 512,
+        dim_max: int = 768,
+    ) -> Prediction:
+        """
+        Detect license plate in the image.
+
+        Args:
+            image (Image.Image): The image to run detection on.
+            scaling_ratio (float): The scaling ratio of the resulting bounding polygon. Defaults to 1.0.
+            dim_min (int): The minimum dimension of the resized image. Defaults to 512.
+            dim_max (int): The maximum dimension of the resized image. Defaults to 768.
+
+        Returns:
+            Prediction: The prediction result with highest confidence.
+        """
         orig_h, orig_w = image.height, image.width
 
         # Resize the image to fixed ratio
@@ -130,7 +205,6 @@ def predict(self, image: Image.Image, scaling_ratio: float = 1.0, dim_min: int =
         bounds[:, 1] *= orig_h
 
         return Prediction(
-            image=image,
-            bounds=bounds.astype(np.int32),
-            confidence=max_prob.item()
+            bounds=[(x, y) for x, y in np.int32(bounds).tolist()],
+            confidence=max_prob.item(),
         )