diff --git a/Dockerfile b/Dockerfile index 32fa8f5..1cd6af5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,6 +4,8 @@ FROM nvcr.io/nvidia/pytorch:24.06-py3 WORKDIR /app COPY . ./ +# PLANTNET_API_KEY is a secret: inject it at runtime (e.g. `docker run -e PLANTNET_API_KEY=...`); never bake it into the image. + # 2. Install dependencies RUN pip install --upgrade pip RUN pip install -r requirements.txt @@ -18,18 +20,8 @@ RUN python -c "import cv2; print('OpenCV imported successfully')" # 3. Create a local cache directory for the model RUN mkdir -p /hf_cache/microsoft/Florence-2-base -# RUN mkdir -p /hf_cache/microsoft/Florence-2-large - -# 4. Download the model files info /hf_cache/microsoft/Florence-2-large -RUN huggingface-cli download \ - microsoft/Florence-2-large \ - --repo-type model \ - --cache-dir /hf_cache \ - --local-dir /hf_cache/microsoft/Florence-2-large \ - --resume \ - --force # <-- ensures we overwrite any existing files or resume a download -# 5. Download the model files info /hf_cache/microsoft/Florence-2-base +# 4. Download the model files into /hf_cache/microsoft/Florence-2-base RUN huggingface-cli download \ microsoft/Florence-2-base \ --repo-type model \ @@ -38,7 +30,7 @@ RUN huggingface-cli download \ --resume \ --force # <-- ensures we overwrite any existing files or resume a download -# 6. Set the environment variables for offline mode +# 5. Set the environment variables for offline mode ENV HF_HOME=/hf_cache ENV TRANSFORMERS_OFFLINE=1 ENV HF_DATASETS_OFFLINE=1 diff --git a/README.md b/README.md index 588332e..b12cefc 100644 --- a/README.md +++ b/README.md @@ -1,26 +1,58 @@ # PTZ APP -This is an application for sending images of specific objects autonomously using PTZ cameras. +This is an intelligent, autonomous PTZ camera application that uses advanced vision-language models (YOLO or Florence-2) to detect, frame, and analyze objects of interest in real-time. It is designed for deployment on edge computing nodes within the Sage project.
+ +--- + +## What’s New + +- **Florence-2 Enhancements** + - Automatic scene context generation at the start of scans. + - Support for manual prompt injection with `--prompt_prefix` to guide detections. + +- **Enhanced Data Logging** + - Publishes scene captions (Florence-2) and raw detection data (label, confidence, position) with each scan. + - Improved debug-level logging. + +- **Pipeline Updates** + - Added Scene Analysis step (Florence-2 only). + - Data Publishing step now explicitly includes metadata publishing. + +--- ## How It Works The algorithm performs the following steps: -1. **Initialization**: Sets up object detection model (YOLO or Florence) based on user parameters. +1. **Initialization** + Sets up the object detection model (`YOLO` or `Florence-2`) based on user parameters. -2. **Area Scanning**: Systematically scans the environment by rotating the PTZ camera in pan steps (default: 15 degrees) through a full 360° rotation at the specified tilt and zoom level. +2. **Scene Analysis (Florence-2 Only)** + At the start of a scan, Florence-2 can automatically generate a detailed text caption of the current scene to use as a dynamic, contextual prompt. -3. **Object Detection**: At each camera position, captures an image and runs object detection to identify specified objects (e.g., person, car, dog). +3. **Contextual Area Scanning** + Systematically scans the environment by rotating the PTZ camera in pan steps (default: 15°) through a full 360° rotation at the specified tilt and zoom level. + When using Florence-2, the generated (or user-specified) context is incorporated to improve detection relevance. -4. **Filtering**: Filters detections based on confidence threshold (default: 0.1). +4. **Object Detection** + At each camera position, captures an image and runs object detection to identify specified objects (e.g., person, car, dog). -5. **Object Tracking**: When an object of interest is detected with sufficient confidence, the algorithm: +5. 
**Filtering** + Filters detections based on confidence threshold (default: 0.1). + +6. **Object Tracking** + When an object of interest is detected with sufficient confidence, the algorithm: - Centers the camera on the detected object - Adjusts zoom to maximize the object in the frame -6. **Image Publishing**: Saves and publishes the optimized images of detected objects. +7. **Data Publishing** + Saves and publishes the optimized images of detected objects. + Publishes rich metadata—including scene captions (Florence-2), raw detection data (labels, confidence, position), and logging outputs—to the Sage data portal. + +8. **Iteration** + Repeats the process for the specified number of iterations with configurable delay between scans. -7. **Iteration**: Repeats the process for the specified number of iterations with configurable delay between scans. +--- ## Build the container @@ -51,6 +83,24 @@ sudo docker run -it --rm your_docker_hub_user_name/ptzapp:latest -ki -it 5 -un c ```bash sudo docker run --gpus all -it --rm your_docker_hub_user_name/ptzapp:latest --model Florence-base --iterations 5 --username username --password 'password' --cameraip 130.202.23.92 --objects 'person,car' ``` +## Advanced Usage with Florence-2 + +### Fully Autonomous Mode (Automatic Context) + +When using Florence-2 without a manual prompt, the application will automatically analyze the scene to generate its own context before searching for objects: + +```bash +sudo docker run --gpus all -it --rm your_docker_hub_user_name/ptzapp:latest --model Florence-base --objects 'animal,bird,deer' --username <username> --password '<password>' --cameraip <camera_ip> +``` + +### Manual Context Prompt + +You can provide your own context to the model using the `--prompt_prefix` argument to guide detections: + +```bash +sudo docker run --gpus all -it --rm your_docker_hub_user_name/ptzapp:latest --model Florence-base --objects 'animal,bird' --prompt_prefix 'A photo from a trail camera in a wilderness environment' --username <username> --password '<password>'
--cameraip <camera_ip> +``` + ## Using Different Object Detection Models @@ -94,4 +144,91 @@ sudo docker run --gpus all -it --rm your_docker_hub_user_name/ptzapp:latest --mo | `--zoom` | `-zm` | Zoom value | 1 | | `--confidence` | `-conf` | Confidence threshold (0-1) | 0.1 | | `--iterdelay` | `-id` | Minimum delay in seconds between iterations | 60.0 | +| `--prompt_prefix` | | Manual text prompt for Florence-2 context | "" | | `--debug` | | Enable debug level logging | False | + +## Environment Variables +1. `PLANTNET_API_KEY` — for PlantNet API calls +2. `BLUR_MIN` — Laplacian variance threshold below which a focus retry is triggered (default 70) +3. `SPECIES_MIN_SCORE` — minimum PlantNet confidence to treat as “confident” (e.g., 0.25) + +## Results & Observations + +When you launch the app (either with python main.py … or via the Docker command shown above), you’ll see three kinds of outputs: + +1. Console logs +2. Saved images under /imgs (mount this to a host folder to persist) +3. Published messages (via Waggle plugin) containing detections & species results + +### 1. Console Logs +You should see a sequence like: +- Scene caption (Florence only, if enabled) + ```bash + Generating dynamic context caption for the scene... + Scene Context: "The image shows a red fence with multiple rows of small holes..." + ``` +- PTZ sweep & detection + ```bash + Trying PTZ: 0 0 1 + Published detection: ptz.detection.p0t0z1 + Plant detected (trees). Starting species identification workflow... + ``` +- Centering & zoom math (in degrees) and a best-of-N capture with blur score + ```bash + CAMERA MOVEMNET + zoom_level: + current_h_fov: + current_v_fov: + Move the camera to center the object + Pan: + Tilt: + + Taking final snapshot(s) for PlantNet...
(Example output) + [PLANTNET] using image -> /imgs/50.19,-13.11,11.38_plantnet_try_2025-09-10_23:19:17.327619.jpg (blur=9242.4) + ``` +- PlantNet result (success) + ```bash + Species: Quercus garryana + Common Names: ['Garry oak', 'Oregon oak', 'Oregon white oak'] + Score: 0.3217 + ``` +- PlantNet error example (no match / 404) (Does not publish misclassification) + ```bash + PlantNet identification failed: PlantNet API request failed with status 404: {"statusCode":404,"error":"Not Found","message":"Species not found"} + ``` + +### 2. Saved Images +All captured frames land in /imgs inside the container. Mount it to your host to persist. +Filename format: +```bash + <pan>,<tilt>,<zoom>_<action>_<timestamp>.jpg +``` +Without --keepimages, interim candidates may be cleaned up; the final selection is kept when you mount /imgs. + +### 3. Published Messages (Waggle) - Sample output +- Scene caption (Florence): ptz.scene.caption — free-text description +- Per-position detection: ptz.detection.p{pan}t{tilt}z{zoom} +- Blur/sharpness telemetry: ptz.image.blur +- PlantNet species (if any): ptz.plantnet.species +- Plain score: ptz.plantnet.score +- Alerts (optional, via alert_system.py): ptz.alert.{type} (e.g., invasive_species, rare_species) with the species JSON + +### What a “Good” Run Looks Like + +- Multiple `Trying PTZ: …` lines per iteration +- At least one `ptz.detection.p...` with confidence ≥ your `--confidence` +- For plant labels: centering/zoom logs, blur telemetry, PlantNet success block or a clear error +- Images written to `/imgs` (mount or `--keepimages`) + +### Troubleshooting +- No species shown: PlantNet may return 404/no match. Ensure `PLANTNET_API_KEY` is set; improve view (more leaves/flowers, less backlight), increase `--species_zoom`, or adjust framing. +- Detections but no centering/zoom: Detection didn’t pass `--confidence`. Lower it slightly or ensure your `--objects` include plant terms (plant,tree,flower,bush,wildflower…). +- No images on host: Mount `/imgs` (`-v "$(pwd)/imgs":/imgs`) or use `--keepimages`.
+- Soft images (low blur): Increase settle delays, try a larger `--species_zoom`, or lower `BLUR_MIN` to reduce retries. + +### How It Works +- Detect objects with YOLO or Florence-2. +- Route plants via a keyword map (tree, bush, flower, plant, …). +- Center & maximize the bbox using FOV-based pan/tilt and relative zoom. +- Best-of-N capture with Laplacian variance; pick the sharpest (optionally focus-jiggle retry). +- PlantNet identify and publish results + blur telemetry + optional alerts. diff --git a/ecr-meta/ecr-icon.jpg b/ecr-meta/ecr-icon.jpg index 2a261ea..b23b7d3 100644 Binary files a/ecr-meta/ecr-icon.jpg and b/ecr-meta/ecr-icon.jpg differ diff --git a/ecr-meta/ecr-science-description.md b/ecr-meta/ecr-science-description.md index b23e1fb..1f6ed95 100644 --- a/ecr-meta/ecr-science-description.md +++ b/ecr-meta/ecr-science-description.md @@ -7,7 +7,10 @@ The application can deploy either **YOLO** (yolov8-yolo11n) or **Florence v2** m The workflow is: 1. The camera rotates (pan/tilt) and zooms in pre-determined or incremental steps to scan the environment. 2. Live frames are captured and processed by the selected AI model (YOLO or Florence v2). -3. If an object of interest is detected with sufficient confidence, the system automatically adjusts the PTZ camera to center and maximize the object in the frame. +3. If an object of interest is detected with sufficient confidence, the system automatically adjusts the PTZ camera to center and maximize the object in the frame. + i. After centering/zooming on plants, the app can optionally perform species identification using the PlantNet API. + ii. It captures several bracketed snapshots at different zooms, ranks them by sharpness (variance of Laplacian), and submits the sharpest image to PlantNet. + iii. Results (top candidate and optional top-K) are published as Waggle telemetry, with an optional alert if the species matches a monitored list (e.g., invasive/rare) 4. 
A picture is taken and sent to the cloud infrastructure for further processing, archiving, or real-time alerts. By pushing this AI capability to the edge, the system operates continuously with minimal latency and reduced bandwidth usage—uploading only relevant snapshots rather than a constant video feed. @@ -24,6 +27,23 @@ By pushing this AI capability to the edge, the system operates continuously with - Can detect virtually any object when used with the wildcard (`*`) parameter - Operates in `` task mode for general object detection - More resource-intensive but provides greater detection flexibility +- Can optionally caption the scene to build context for detection +- Used in (object detection) mode for general objects, then branches to PlantNet when labels resemble plants + +### Image Quality & Focus +- Sharpness metric: variance of Laplacian (higher = sharper). +- Telemetry: publishes blur as ptz.image.blur with blur_var_laplacian. +- Refocus gate: if blur < BLUR_MIN (env), a short focus pulse is attempted before retrying. +- Settle delays: short sleeps after pan/tilt/zoom to allow AF/exposure to stabilize. 
+ +### Telemetry Topics (Waggle) +- ptz.detection.p{pan}t{tilt}z{zoom} — label, confidence, bbox, PTZ pose, timestamp +- ptz.scene.caption — optional Florence scene caption used as context +- ptz.image.blur — blur metric for the chosen PlantNet frame +- ptz.plantnet.topk — top-K (K=3) PlantNet candidates with scores +- ptz.plantnet.species — final published species (gated by SPECIES_MIN_SCORE) +- ptz.plantnet.score — convenience score metric +- ptz.alert.{type} — alert on invasive/rare species (if configured) # Arguments The application supports the following command-line arguments: @@ -53,6 +73,18 @@ The application supports the following command-line arguments: Keep collected images in persistent folder for later use (Default: False) - **`--debug`** Enable debug level logging (Default: False) +- **`--prompt_prefix`** + Optional text prefix to add context for Florence prompts (empty = auto caption). +- **`--species_zoom`** + Extra relative zoom step used for species detail (Default: 10). + +### Environment Variables +- **`PLANTNET_API_KEY`** + Required for PlantNet API calls. +- **`BLUR_MIN`** + Laplacian variance threshold below which a focus retry is triggered (Default: 70). +- **`SPECIES_MIN_SCORE`** + Minimum PlantNet confidence required to publish a species result (Default: 0.25). ## Example Usage @@ -71,6 +103,14 @@ python main.py -it 5 -obj "person,car,dog" -un admin -pw secret -ip 192.168.1.10 python main.py -it 5 -obj "*" -un username -pw 'password' -ip 130.202.23.92 -m Florence-base -conf 0.15 ``` +### Using Florence for plant species identification +```bash +PLANTNET_API_KEY=... BLUR_MIN=70 SPECIES_MIN_SCORE=0.25 \ +python main.py \ + -it 3 -obj "plant,tree" -un camera -pw 'secret' -ip 192.168.1.100 \ + -m Florence-base --species_zoom 10 --iterdelay 0 --debug +``` + # Ontology The interesting images collected by the system are tagged with metadata for easy retrieval and analysis.
This includes: @@ -80,4 +120,12 @@ The interesting images collected by the system are tagged with metadata for easy - Camera position (pan, tilt, zoom) - Location data (if available) +In addition to existing fields, images and messages may include: +- Species (scientific) +- Common names +- PlantNet score +- Blur sharpness (blur_var_laplacian) +- Candidates (top-K species + scores, debug) +- These fields enable downstream filtering by species, quality scoring, and confidence-based triage. + This metadata enables systematic analysis of object presence, movement patterns, and temporal dynamics in the monitored environment. diff --git a/ecr-meta/ecr-science-image.jpg b/ecr-meta/ecr-science-image.jpg index 2a261ea..ccb3ded 100644 Binary files a/ecr-meta/ecr-science-image.jpg and b/ecr-meta/ecr-science-image.jpg differ diff --git a/main.py b/main.py index cc21c76..2f45d44 100644 --- a/main.py +++ b/main.py @@ -3,13 +3,33 @@ import argparse import time from PIL import Image +from waggle.plugin import Plugin +import json +import logging +import cv2 +import numpy as np +from source import alert_system from source.bring_data import ( center_and_maximize_object, get_image_from_ptz_position, publish_images, + grab_image, ) -from source.object_detector import DetectorFactory -import logging +from source import plantnet_client, sunapi_control as camera_control +from source.object_detector import DetectorFactory, FlorenceDetector + +# ---- Plant label helpers +PLANT_KEYWORDS = { + 'plant','flower','tree','wildflower','bush','shrub','vegetation', + 'leaf','leaves','branch','branches','trunk','potted plant','potted' +} +LARGE_PLANT = {'tree', 'bush'} + +# ---- Tunables (can override via env) +BLUR_MIN = float(os.getenv("BLUR_MIN", "70")) # refocus threshold +SPECIES_MIN_SCORE = float(os.getenv("SPECIES_MIN_SCORE", "0.25")) # publish gate + +logger = logging.getLogger(__name__) def get_argparser(): @@ -77,6 +97,18 @@ def get_argparser(): type=float, default=60.0, ) + 
parser.add_argument( + "--prompt_prefix", + help="An optional prefix to add to the Florence prompt for context.", + type=str, + default="", + ) + parser.add_argument( + "--species_zoom", + help="Additional relative zoom steps to apply for species identification (default=10).", + type=int, + default=10, + ) parser.add_argument( "-conf", "--confidence", @@ -88,6 +120,35 @@ def get_argparser(): return parser +def _blur_score(path: str) -> float: + """Variance of Laplacian: higher = sharper.""" + try: + img = cv2.imread(path, cv2.IMREAD_GRAYSCALE) + if img is None: + return 0.0 + return float(cv2.Laplacian(img, cv2.CV_64F).var()) + except Exception: + return 0.0 + + +def _plantnet_topk(raw: dict, k: int = 3): + """Extract a compact top-k from PlantNet raw JSON.""" + out = [] + try: + results = (raw or {}).get("results") or [] + for r in results[:k]: + sp = r.get("species", {}) or {} + name = sp.get("scientificNameWithoutAuthor") or sp.get("scientificName") + out.append({ + "species": name, + "common_names": sp.get("commonNames", []), + "score": round(float(r.get("score", 0.0)), 4), + }) + except Exception: + pass + return out + + def look_for_object(args): objects = [obj.strip().lower() for obj in args.objects.split(",")] pans = [angle for angle in range(0, 360, args.panstep)] @@ -100,41 +161,225 @@ def look_for_object(args): print(f"Error creating detector: {str(e)}") sys.exit(1) - for iteration in range(args.iterations): - iteration_start_time = time.time() + with Plugin() as plugin: + for iteration in range(args.iterations): + iteration_start_time = time.time() - for pan, tilt, zoom in zip(pans, tilts, zooms): - print(f"Trying PTZ: {pan} {tilt} {zoom}") - image_path, detection = get_image_from_ptz_position( - args, objects, pan, tilt, zoom, detector, None - ) + # --- Dynamic scene caption for Florence (only if no manual prompt) --- + dynamic_prompt_prefix = args.prompt_prefix + if not dynamic_prompt_prefix and isinstance(detector, FlorenceDetector): + 
print("Generating dynamic context caption for the scene...") + try: + cam = camera_control.CameraControl(args.cameraip, args.username, args.password) + cam.absolute_control(pans[0], tilts[0], zooms[0]) + temp_image_path = grab_image(camera=cam, args=args, action="caption_shot") + if temp_image_path: + try: + with Image.open(temp_image_path) as _im: + _im.load() + dynamic_prompt_prefix = detector.caption(_im) + finally: + try: + os.remove(temp_image_path) + except Exception: + pass + print(f"Scene Context: \"{dynamic_prompt_prefix}\"") + plugin.publish("ptz.scene.caption", dynamic_prompt_prefix) + except Exception as e: + print(f"Could not generate dynamic caption: {e}") - if detection is None or detection["reward"] > (1 - args.confidence): - if image_path and os.path.exists(image_path): - os.remove(image_path) - continue + # --- Sweep PTZ positions --- + for pan, tilt, zoom in zip(pans, tilts, zooms): + print(f"Trying PTZ: {pan} {tilt} {zoom}") + image_path, detection = get_image_from_ptz_position( + args, objects, pan, tilt, zoom, detector, None, dynamic_prompt_prefix + ) + + # Nothing useful? 
clean any temp and continue + if detection is None or detection["reward"] > (1 - args.confidence): + if image_path and os.path.exists(image_path): + try: + os.remove(image_path) + except Exception: + pass + continue + + # Publish initial detection telemetry (add raw_reward for later calibration) + detection_name = f"ptz.detection.p{pan}t{tilt}z{int(zoom)}" + detection_payload = { + "label": detection["label"], + "confidence": round(1 - detection["reward"], 4), + "raw_reward": detection["reward"], + "bbox": detection["bbox"], + "ptz_position": [pan, tilt, zoom], + "timestamp": time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()) + } + plugin.publish(detection_name, json.dumps(detection_payload)) + print(f"Published detection: {detection_name}") + + # Decide path by label — load/copy so we can safely delete later + with Image.open(image_path) as _im: + _im.load() + image = _im.copy() + + # Normalize label and handle plurals/hyphens + label_norm = detection["label"].lower().strip() + label_norm = label_norm.replace('-', ' ') # "wild-flower" -> "wild flower" + root = label_norm[:-1] if label_norm.endswith('s') else label_norm # "trees" -> "tree" + tokens = set(label_norm.split()) + is_plant = ( + root in PLANT_KEYWORDS + or label_norm in PLANT_KEYWORDS + or any(tok in PLANT_KEYWORDS for tok in tokens) + ) + + if is_plant: + # --- PLANT WORKFLOW: center/zoom + best-of-N sharpest PlantNet shot --- + print(f"Plant detected ({detection['label']}). 
Starting species identification workflow...") + try: + # Step 1: center + frame whole plant + center_and_maximize_object(args, detection["bbox"], image, + detection["reward"], detection["label"]) + + # Step 2: if large plant, add an extra zoom for details + if root in LARGE_PLANT and args.species_zoom > 0: + print(f"Performing additional zoom ({args.species_zoom}) for species detail...") + cam = camera_control.CameraControl(args.cameraip, args.username, args.password) + cam.relative_control(pan=0, tilt=0, zoom=args.species_zoom) + time.sleep(2) + + # Step 3: Best-of-N shots with blur gate + print("Taking final snapshot(s) for PlantNet...") + cam = camera_control.CameraControl(args.cameraip, args.username, args.password) + + def _take_stable_shot(action_label: str, settle: float = 1.0): + time.sleep(settle) # AF/exposure settle + return grab_image(camera=cam, args=args, action=action_label) - label = detection["label"] - bbox = detection["bbox"] - reward = detection["reward"] - confidence = 1 - reward + zoom_increments = [0] + if args.species_zoom > 0: + zoom_increments.extend([max(1, args.species_zoom // 2), args.species_zoom]) - print(f"Following {label} object (confidence: {confidence:.2f})") + candidates = [] + for dz in zoom_increments: + try: + if dz: + cam.relative_control(pan=0, tilt=0, zoom=dz) + time.sleep(0.5) + shot_path = _take_stable_shot("plantnet_try", settle=1.0) + if shot_path: + candidates.append((shot_path, _blur_score(shot_path))) + except Exception as e: + print("Bracket shot failed:", e) - image = Image.open(image_path) - center_and_maximize_object(args, bbox, image, reward, label) + if not candidates: + print("No candidate images captured for PlantNet.") + else: + # pick sharpest + candidates.sort(key=lambda t: t[1], reverse=True) + final_image_path, best_blur = candidates[0] + print(f"[PLANTNET] using image -> {final_image_path} (blur={best_blur:.1f})") - if os.path.exists(image_path): - os.remove(image_path) + # optional: one focus jiggle 
retry if still soft + if best_blur < BLUR_MIN: + try: + cam.continuous_control(focus='Near'); time.sleep(0.3) + cam.continuous_control(focus='Stop'); time.sleep(0.6) + retry_path = _take_stable_shot("plantnet_refocus", settle=1.0) + if retry_path: + retry_blur = _blur_score(retry_path) + if retry_blur > best_blur: + if not args.keepimages and final_image_path and final_image_path != retry_path: + try: + os.remove(final_image_path) + except Exception: + pass + final_image_path, best_blur = retry_path, retry_blur + else: + if not args.keepimages and retry_path: + try: + os.remove(retry_path) + except Exception: + pass + except Exception as e: + print("Focus jiggle retry failed:", e) + + # publish blur telemetry + plugin.publish("ptz.image.blur", json.dumps({ + "blur_var_laplacian": round(best_blur, 2), + "ptz_position": [pan, tilt, zoom] + })) + + # PlantNet identify + try: + species_results = plantnet_client.identify_plant(final_image_path) + if species_results: + # Optional top-k telemetry from raw + topk = _plantnet_topk(species_results.get("raw", {}), k=3) + if topk: + plugin.publish("ptz.plantnet.topk", json.dumps({ + "ptz_position": [pan, tilt, zoom], + "candidates": topk + })) + + # Gate publishing on score + score = float(species_results.get("score", 0.0)) + species_name = species_results.get("species") + + if species_name and score >= SPECIES_MIN_SCORE: + lean = { + "species": species_name, + "common_names": species_results.get("common_names", []), + "score": round(score, 4), + "blur_var_laplacian": round(best_blur, 2), + "ptz_position": [pan, tilt, zoom], + "timestamp": time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()) + } + print(f"Published plantnet species: {lean}") + plugin.publish("ptz.plantnet.species", json.dumps(lean)) + plugin.publish("ptz.plantnet.score", str(lean["score"])) + + # Alerts on invasive/rare + alert_type, _ = alert_system.check_for_alert(species_name) + if alert_type: + print(f"!!! ALERT: {alert_type} '{species_name}' detected! 
Publishing alert.") + plugin.publish(f"ptz.alert.{alert_type}", json.dumps(lean)) + else: + print(f"Species score low ({score:.2f}) or name missing; skipping publish.") + except Exception as e: + print(f"PlantNet identification failed: {e}") + + # cleanup non-selected candidates if not keeping images + if not args.keepimages: + for p, _b in candidates[1:]: + try: + os.remove(p) + except Exception: + pass + except Exception as e: + print(f"PlantNet identification pipeline failed: {e}") + else: + # --- OTHER OBJECTS: follow & maximize in frame --- + print(f"Following {detection['label']} object (confidence: {1 - detection['reward']:.2f})") + center_and_maximize_object(args, detection["bbox"], image, + detection["reward"], detection["label"]) + + # always clean initial temp image from this PTZ step + if image_path and os.path.exists(image_path): + try: + os.remove(image_path) + except Exception: + pass - publish_images() + # publish any saved images and wait for next iteration + publish_images(args.keepimages) - iteration_time = time.time() - iteration_start_time - if args.iterdelay > 0: - remaining_delay = max(0, args.iterdelay - iteration_time) - if remaining_delay > 0: - print(f"Waiting {remaining_delay:.2f} seconds before next iteration...") - time.sleep(remaining_delay) + iteration_time = time.time() - iteration_start_time + if args.iterdelay > 0: + remaining_delay = max(0, args.iterdelay - iteration_time) + if remaining_delay > 0: + print(f"Waiting {remaining_delay:.2f} seconds before next iteration...") + time.sleep(remaining_delay) def main(): diff --git a/requirements.txt b/requirements.txt index e6077ea..4347be3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ pywaggle[all] huggingface_hub ultralytics>=8.3.70 torch>=2.0.0 -transformers>=4.30.0 +transformers==4.40.2 pillow>=9.0.0 -numpy>=1.21.0 +numpy<2.0 requests>=2.25.0 diff --git a/sage.yaml b/sage.yaml index 99c2fa9..e85976c 100644 --- a/sage.yaml +++ b/sage.yaml @@ -1,18 +1,18 @@ 
-name: "ptzapp-yolo" +name: "ptzapp" namespace: "waggle" -description: "Intelligent Pan-Tilt-Zoom Camera - modified for YOLOv8-YOLO11 use" -version: "0.1.14" -authors: "Dario Dematties , Peter Lebiedzinski " +description: "Intelligent Pan-Tilt-Zoom camera app with YOLO/Florence detection and PlantNet species ID" +version: "0.2.0" +authors: "Saumya Pailwan , Dario Dematties , Peter Lebiedzinski " collaborators: "The Sage Team" funding: "NSF 1935984, DOE" license: "Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License" -keywords: "camera, ptz, pan, tilt, zoom, florence, waggle" -homepage: "https://github.com/plebbyd/PTZ_APP" +keywords: "camera, ptz, pan, tilt, zoom, yolo, florence, plantnet, waggle" +homepage: "https://github.com/saumya-pailwan/PTZ_APP.git" source: architectures: - "linux/amd64" - "linux/arm64" - url: "https://github.com/plebbyd/PTZ_APP.git" + url: "https://github.com/saumya-pailwan/PTZ_APP.git" branch: "main" inputs: @@ -42,3 +42,7 @@ inputs: type: "boolean" - id: "debug" type: "boolean" + - id: "prompt_prefix" + type: "string" + - id: "species_zoom" + type: "int" diff --git a/saved_images/90.0,0.0,1.0_17_2025-08-21_06:39:12.326315.jpg b/saved_images/90.0,0.0,1.0_17_2025-08-21_06:39:12.326315.jpg new file mode 100644 index 0000000..2504340 Binary files /dev/null and b/saved_images/90.0,0.0,1.0_17_2025-08-21_06:39:12.326315.jpg differ diff --git a/saved_images/building_conf0.12_20250821_020337.jpg b/saved_images/building_conf0.12_20250821_020337.jpg new file mode 100644 index 0000000..b79a23f Binary files /dev/null and b/saved_images/building_conf0.12_20250821_020337.jpg differ diff --git a/saved_images/building_conf0.12_20250821_035120.jpg b/saved_images/building_conf0.12_20250821_035120.jpg new file mode 100644 index 0000000..6a12fd1 Binary files /dev/null and b/saved_images/building_conf0.12_20250821_035120.jpg differ diff --git a/saved_images/building_conf0.12_20250821_051246.jpg 
b/saved_images/building_conf0.12_20250821_051246.jpg new file mode 100644 index 0000000..39f028d Binary files /dev/null and b/saved_images/building_conf0.12_20250821_051246.jpg differ diff --git a/saved_images/building_conf0.12_20250821_060818.jpg b/saved_images/building_conf0.12_20250821_060818.jpg new file mode 100644 index 0000000..b9ec028 Binary files /dev/null and b/saved_images/building_conf0.12_20250821_060818.jpg differ diff --git a/saved_images/building_conf0.12_20250821_063900.jpg b/saved_images/building_conf0.12_20250821_063900.jpg new file mode 100644 index 0000000..99f6571 Binary files /dev/null and b/saved_images/building_conf0.12_20250821_063900.jpg differ diff --git a/saved_images/door_handle_conf0.99_20250821_035230.jpg b/saved_images/door_handle_conf0.99_20250821_035230.jpg new file mode 100644 index 0000000..4cd2ead Binary files /dev/null and b/saved_images/door_handle_conf0.99_20250821_035230.jpg differ diff --git a/saved_images/window_conf0.97_20250821_020355.jpg b/saved_images/window_conf0.97_20250821_020355.jpg new file mode 100644 index 0000000..264426e Binary files /dev/null and b/saved_images/window_conf0.97_20250821_020355.jpg differ diff --git a/saved_images/window_conf0.97_20250821_020407.jpg b/saved_images/window_conf0.97_20250821_020407.jpg new file mode 100644 index 0000000..4c9fd41 Binary files /dev/null and b/saved_images/window_conf0.97_20250821_020407.jpg differ diff --git a/saved_images/window_conf0.97_20250821_035139.jpg b/saved_images/window_conf0.97_20250821_035139.jpg new file mode 100644 index 0000000..c5f46ef Binary files /dev/null and b/saved_images/window_conf0.97_20250821_035139.jpg differ diff --git a/saved_images/window_conf0.97_20250821_035151.jpg b/saved_images/window_conf0.97_20250821_035151.jpg new file mode 100644 index 0000000..46a057f Binary files /dev/null and b/saved_images/window_conf0.97_20250821_035151.jpg differ diff --git a/saved_images/window_conf0.97_20250821_035204.jpg 
b/saved_images/window_conf0.97_20250821_035204.jpg new file mode 100644 index 0000000..edac6e6 Binary files /dev/null and b/saved_images/window_conf0.97_20250821_035204.jpg differ diff --git a/saved_images/window_conf0.97_20250821_051305.jpg b/saved_images/window_conf0.97_20250821_051305.jpg new file mode 100644 index 0000000..b0c2904 Binary files /dev/null and b/saved_images/window_conf0.97_20250821_051305.jpg differ diff --git a/saved_images/window_conf0.97_20250821_051317.jpg b/saved_images/window_conf0.97_20250821_051317.jpg new file mode 100644 index 0000000..7bb6086 Binary files /dev/null and b/saved_images/window_conf0.97_20250821_051317.jpg differ diff --git a/saved_images/window_conf0.97_20250821_051330.jpg b/saved_images/window_conf0.97_20250821_051330.jpg new file mode 100644 index 0000000..dd04d3e Binary files /dev/null and b/saved_images/window_conf0.97_20250821_051330.jpg differ diff --git a/saved_images/window_conf0.97_20250821_060837.jpg b/saved_images/window_conf0.97_20250821_060837.jpg new file mode 100644 index 0000000..6f6ad54 Binary files /dev/null and b/saved_images/window_conf0.97_20250821_060837.jpg differ diff --git a/saved_images/window_conf0.97_20250821_060849.jpg b/saved_images/window_conf0.97_20250821_060849.jpg new file mode 100644 index 0000000..b30c345 Binary files /dev/null and b/saved_images/window_conf0.97_20250821_060849.jpg differ diff --git a/saved_images/window_conf0.97_20250821_060902.jpg b/saved_images/window_conf0.97_20250821_060902.jpg new file mode 100644 index 0000000..76d7325 Binary files /dev/null and b/saved_images/window_conf0.97_20250821_060902.jpg differ diff --git a/source/alert_system.py b/source/alert_system.py new file mode 100644 index 0000000..05b3cb6 --- /dev/null +++ b/source/alert_system.py @@ -0,0 +1,30 @@ +import json +import os + +LISTS_FILE_PATH = "species_lists/species_lists.json" + +def _load_species_lists() -> dict: + """Loads all species lists from a single JSON file.""" + if not 
os.path.exists(LISTS_FILE_PATH): + return {} + with open(LISTS_FILE_PATH, 'r') as f: + data = json.load(f) + # Convert lists to sets for faster lookups + data['invasive_species'] = set(data.get('invasive_species', [])) + data['rare_species'] = set(data.get('rare_species', [])) + return data + +def check_for_alert(species_name: str) -> tuple: + """ + Checks if a species is on a list. + Returns a tuple of (alert_type, species_name) or (None, None). + """ + all_lists = _load_species_lists() + + if species_name in all_lists.get('invasive_species', set()): + return ("invasive_species", species_name) + + if species_name in all_lists.get('rare_species', set()): + return ("rare_species", species_name) + + return (None, None) \ No newline at end of file diff --git a/source/bring_data.py b/source/bring_data.py index d85444a..2502f75 100644 --- a/source/bring_data.py +++ b/source/bring_data.py @@ -65,7 +65,12 @@ def center_and_maximize_object(args, bbox, image, reward=None, label=None): except Exception as e: logger.error("Error when getting camera: %s", e) - _, _, zoom_level = Camera1.requesting_cameras_position_information() + try: + _, _, zoom_level = Camera1.requesting_cameras_position_information() + except Exception as e: + logger.warning("PTZ status query failed (%s). Falling back to last commanded zoom=%s.", e, args.zoom) + zoom_level = args.zoom # safe fallback + print(f'zoom_level: {zoom_level}') # Get current FOV based on zoom level @@ -83,12 +88,17 @@ def center_and_maximize_object(args, bbox, image, reward=None, label=None): print(f'Tilt: {tilt}') try: Camera1.relative_control(pan=pan, tilt=tilt) + time.sleep(0.6) except Exception as e: logger.error("Error when setting relative position: %s", e) # Calculate the current size of the bounding box bbox_width = x2 - x1 bbox_height = y2 - y1 + if bbox_width <= 1 or bbox_height <= 1: + logger.warning("BBox too small to zoom safely (w=%s h=%s). 
Skipping zoom.", bbox_width, bbox_height) + bbox_width = max(bbox_width, 2) + bbox_height = max(bbox_height, 2) # Calculate the zoom factor to maximize the object size zoom_factor_x = image_width / bbox_width @@ -104,6 +114,7 @@ def center_and_maximize_object(args, bbox, image, reward=None, label=None): current_zoom_factor = zoom_level / MZ target_zoom_factor = current_zoom_factor * zoom_factor relative_zoom = target_zoom_factor * (MZ - mz) - zoom_level + relative_zoom = max(-(MZ - mz), min(MZ - mz, relative_zoom)) print('current_zoom_factor: ', current_zoom_factor) print('target_zoom_factor: ', target_zoom_factor) @@ -112,6 +123,7 @@ def center_and_maximize_object(args, bbox, image, reward=None, label=None): print(f'Relative zoom: {relative_zoom}') try: Camera1.relative_control(pan=0, tilt=0, zoom=relative_zoom) + time.sleep(0.8) except Exception as e: logger.error("Error when setting relative position: %s", e) @@ -122,11 +134,14 @@ def center_and_maximize_object(args, bbox, image, reward=None, label=None): image_path = os.path.join(tmp_dir, filename) try: + tmp_dir.mkdir(exist_ok=True, mode=0o777) Camera1.snap_shot(image_path) + return image_path except Exception as e: logger.error("Error saving detection image: %s", e) + return None -def get_image_from_ptz_position(args, object_, pan, tilt, zoom, model, processor): +def get_image_from_ptz_position(args, object_, pan, tilt, zoom, model, processor, prompt_prefix: str = ""): try: Camera1 = camera_control.CameraControl( args.cameraip, args.username, args.password @@ -135,13 +150,24 @@ def get_image_from_ptz_position(args, object_, pan, tilt, zoom, model, processor logger.error("Error when getting camera: %s", e) Camera1.absolute_control(pan, tilt, zoom) + time.sleep(0.6) # settle before first shot tmp_dir.mkdir(exist_ok=True, mode=0o777) aux_image_path = grab_image(camera=Camera1, args=args, action=0) - image = Image.open(aux_image_path) - os.remove(aux_image_path) + if not aux_image_path or not 
os.path.exists(aux_image_path): + # No image captured; return early (no detection) + return None, None + + # Safely read and free the file before deletion + with Image.open(aux_image_path) as _im: + _im.load() # force read into memory + image = _im.copy() # detach from file handle + try: + os.remove(aux_image_path) + except Exception: + pass - detections = get_label_from_image_and_object(image, object_, model, processor) + detections = get_label_from_image_and_object(image, object_, model, prompt_prefix) if not detections: LABEL = None @@ -156,18 +182,17 @@ def get_image_from_ptz_position(args, object_, pan, tilt, zoom, model, processor } image_path = grab_image(camera=Camera1, args=args, action=random.randint(0,20)) + if not image_path or not os.path.exists(image_path): + return None, None return image_path, LABEL -def publish_images(): +def publish_images(keep=False): with Plugin() as plugin: ct = str(datetime.datetime.now()) for image_file in os.listdir(tmp_dir): - complete_path = os.path.join(tmp_dir, image_file) - print('Publishing') - print(complete_path) - plugin.upload_file(complete_path) - - shutil.rmtree(tmp_dir, ignore_errors=True) + plugin.upload_file(os.path.join(tmp_dir, image_file)) + if not keep: + shutil.rmtree(tmp_dir, ignore_errors=True) def get_fov_from_zoom(zoom_level): # Camera specifications diff --git a/source/object_detector.py b/source/object_detector.py index 9ac3628..0a6adf5 100644 --- a/source/object_detector.py +++ b/source/object_detector.py @@ -13,7 +13,7 @@ class ObjectDetector(ABC): """Abstract base class for object detection models""" @abstractmethod - def detect(self, image: Image.Image, target_objects: Union[str, List[str]]) -> Tuple[List[float], List[List[int]], List[str]]: + def detect(self, image: Image.Image, target_objects: Union[str, List[str]], prompt_prefix: str = "") -> Tuple[List[float], List[List[int]], List[str]]: """Detect objects in an image""" pass @@ -55,7 +55,7 @@ def load_model(self): print(f"Error loading 
model {self.model_name}: {str(e)}") raise RuntimeError(f"Failed to load YOLO model {self.model_name}. Error: {str(e)}") - def detect(self, image: Image.Image, target_objects: Union[str, List[str]]) -> Tuple[List[float], List[List[int]], List[str]]: + def detect(self, image: Image.Image, target_objects: Union[str, List[str]], prompt_prefix: str = "") -> Tuple[List[float], List[List[int]], List[str]]: """ Detect objects using YOLO Args: @@ -129,7 +129,7 @@ def load_model(self): self.model.eval() - def detect(self, image: Image.Image, target_objects: Union[str, List[str]]) -> Tuple[List[float], List[List[int]], List[str]]: + def detect(self, image: Image.Image, target_objects: Union[str, List[str]], prompt_prefix: str = "") -> Tuple[List[float], List[List[int]], List[str]]: """Detect objects using Florence""" # Generate text prompt from target objects provided. The special case # of * should allow Florence-2 to detect any object which seems to @@ -138,8 +138,12 @@ def detect(self, image: Image.Image, target_objects: Union[str, List[str]]) -> T text = "" task = "" elif isinstance(target_objects, list): + # We would add prompt_prefix as an argument to this function joined = " or ".join(target_objects) - text = f" {joined}" + if prompt_prefix: + text = f" {prompt_prefix} of a {joined}" + else: + text = f" {joined}" task = "" else: text = f" {target_objects}" @@ -192,6 +196,26 @@ def detect(self, image: Image.Image, target_objects: Union[str, List[str]]) -> T return rewards, bboxes, labels + def caption(self, image: Image.Image) -> str: + """ + Generates a detailed caption for a given image. + """ + task_prompt = '' + + # We use more tokens here to allow for longer, more descriptive captions. 
+ inputs = self.processor(text=task_prompt, images=image, return_tensors="pt").to(self.device, self.torch_dtype) + generated_ids = self.model.generate( + input_ids=inputs["input_ids"], + pixel_values=inputs["pixel_values"], + max_new_tokens=1024, + num_beams=3 + ) + generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0] + + # The model output includes the prompt, so we split it off to get only the caption. + caption = generated_text.split(task_prompt)[-1].strip() + return caption + class DetectorFactory: """Factory class to create appropriate object detector""" @@ -295,13 +319,13 @@ def get_label_from_image_and_object( image: Image.Image, target_object: str, detector: ObjectDetector, - processor=None # Kept for backwards compatibility + prompt_prefix: str = "" ) -> List[Dict]: """ Unified interface for object detection Returns: List of dictionaries with 'reward', 'bbox', and 'label' keys """ - rewards, bboxes, labels = detector.detect(image, target_object) + rewards, bboxes, labels = detector.detect(image, target_object, prompt_prefix) # Convert to list of dictionaries results = [] diff --git a/source/plantnet_client.py b/source/plantnet_client.py new file mode 100644 index 0000000..fdc33bc --- /dev/null +++ b/source/plantnet_client.py @@ -0,0 +1,101 @@ +import os +import time +import json +import requests +from typing import Dict, Optional + +PLANTNET_API_KEY = os.getenv("PLANTNET_API_KEY", "") +BASE_URL = os.getenv("PLANTNET_BASE_URL", "https://my-api.plantnet.org") +IDENTIFY_PATH = "/v2/identify/all" # canonical, tolerant endpoint + +# Simple retry settings for transient server issues +_RETRY_COUNT = 2 +_RETRY_BACKOFF_SEC = 2.0 +_TIMEOUT_SEC = 45 + + +def _post_with_retry(url: str, params: dict, files, data: Optional[dict] = None) -> requests.Response: + last_exc = None + for attempt in range(_RETRY_COUNT + 1): + try: + return requests.post(url, params=params, files=files, data=data, timeout=_TIMEOUT_SEC) + except 
requests.RequestException as e: + last_exc = e + if attempt < _RETRY_COUNT: + time.sleep(_RETRY_BACKOFF_SEC * (attempt + 1)) + else: + raise + # Shouldn't reach here + raise last_exc # type: ignore + + +def identify_plant(image_path: str) -> Dict: + """ + Identify a plant from a single image using PlantNet. + + Returns a dict like: + { + "species": str | None, + "common_names": list[str], + "score": float, + "raw": + } + On failure or no match, returns {} (so callers can treat as "no result"). + """ + if not PLANTNET_API_KEY: + # Keep this a hard error so misconfiguration is obvious + raise RuntimeError("PLANTNET_API_KEY is not set") + + url = f"{BASE_URL}{IDENTIFY_PATH}" + params = {"api-key": PLANTNET_API_KEY} + + # IMPORTANT: do NOT send 'organs' for now (you requested to skip it) + # The API accepts multiple images under the same 'images' field; we send just one. + try: + with open(image_path, "rb") as f: + files = [("images", (os.path.basename(image_path), f, "image/jpeg"))] + resp = _post_with_retry(url, params=params, files=files) + except FileNotFoundError: + return {} + + # Graceful handling of common non-match response + if resp.status_code == 404 and "Species not found" in resp.text: + return {"species": None, "common_names": [], "score": 0.0, "raw": {"message": "no_match"}} + + # For all other statuses, bail out quietly (let the pipeline continue) + if not resp.ok: + # If you prefer raising here instead of swallowing, change to: resp.raise_for_status() + return {} + + try: + data = resp.json() + except json.JSONDecodeError: + return {} + + results = data.get("results") or [] + if not results: + return {"species": None, "common_names": [], "score": 0.0, "raw": {"message": "no_match"}} + + top = results[0] + score = float(top.get("score", 0.0)) + species_name = None + common = [] + + try: + species_name = top["species"].get("scientificNameWithoutAuthor") or top["species"].get("scientificName") + except Exception: + species_name = None + + try: + cn = 
top["species"].get("commonNames") or [] + if isinstance(cn, list): + common = cn + except Exception: + common = [] + + return { + "species": species_name, + "common_names": common, + "score": round(score, 4), + "raw": data, + } diff --git a/species_lists/species_lists.json b/species_lists/species_lists.json new file mode 100644 index 0000000..c050dfc --- /dev/null +++ b/species_lists/species_lists.json @@ -0,0 +1,9 @@ +{ + "invasive_species": [ + "Cirsium arvense", + "Alliaria petiolata" + ], + "rare_species": [ + "Cypripedium acaule" + ] + } \ No newline at end of file