# save_activations.py

"""
Script to extract and save activations from ViT models (CLIP or OpenCLIP variants).
Supports both regular image directories and webdataset tar files.
"""

import os
import json
import torch
import argparse
import webdataset as wds
from pathlib import Path
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from PIL import Image
import h5py
from tqdm import tqdm
from typing import Optional, Tuple, Type
from ViT_wrapper import ViTWrapper, ModelConfig, CLIPLibrary, BlockType

class ImageDataset(Dataset):
    """Map-style dataset over the image files found recursively under a directory.

    Files whose extension is ``.jpg``, ``.jpeg`` or ``.png`` (compared
    case-insensitively) are collected once at construction time. Each item is
    a ``(transformed_image, path_string)`` pair.

    Args:
        data_path: Root directory that is searched recursively for images.
        transform: Callable applied to every RGB ``PIL.Image``. When ``None``,
            a resize(224) / center-crop(224) / to-tensor / normalize pipeline
            is used.
    """

    # Suffixes are lower-cased before the membership test, so only the
    # lowercase spellings are needed here.
    IMAGE_SUFFIXES = frozenset({'.jpg', '.jpeg', '.png'})

    def __init__(self, data_path: str, transform: Optional[transforms.Compose] = None):
        self.data_path = Path(data_path)
        print(f"Searching for images in: {self.data_path}")

        # rglob() yields entries in file-system order, which is not stable
        # across machines or runs. Sorting makes the index -> image mapping
        # reproducible. is_file() skips directories that merely carry an
        # image-like suffix.
        self.image_files = sorted(
            f for f in self.data_path.rglob("*")
            if f.is_file() and f.suffix.lower() in self.IMAGE_SUFFIXES
        )

        print(f"Found {len(self.image_files)} images")

        if transform is None:
            # NOTE(review): the normalization constants look like the CLIP
            # preprocessing mean/std - confirm against the model in use.
            self.transform = transforms.Compose([
                transforms.Resize(224),
                transforms.CenterCrop(224),
                transforms.ToTensor(),
                transforms.Normalize((0.48145466, 0.4578275, 0.40821073),
                                  (0.26862954, 0.26130258, 0.27577711))
            ])
        else:
            self.transform = transform

    def __len__(self) -> int:
        """Return the number of image files discovered at construction time."""
        return len(self.image_files)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, str]:
        """Load image ``idx``, convert it to RGB, transform it, and return it with its path."""
        image_path = self.image_files[idx]
        # The context manager closes the underlying file handle as soon as
        # the pixel data has been converted into a new RGB image.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')
        if self.transform is not None:
            image = self.transform(image)
        return image, str(image_path)

class WebDatasetLoader:
    """Handles loading data from tar files using webdataset.

    Shards are discovered by globbing ``*.tar`` inside ``data_path`` rather
    than assuming a fixed shard count, so the loader works for any number of
    shards. For a shard ``<name>.tar`` the optional sidecar file
    ``<name>_stats.json`` is consulted; its ``successes`` field is summed to
    obtain the total sample count.

    Args:
        data_path: Directory containing the ``.tar`` shards (and optionally
            the ``*_stats.json`` files).
        transform: Callable applied to every decoded image. When ``None``, a
            resize(224) / center-crop(224) / to-tensor / normalize pipeline
            is used.

    Raises:
        FileNotFoundError: If ``data_path`` contains no ``.tar`` files.
        ValueError: If no samples could be counted or estimated.
    """

    def __init__(self, data_path: str, transform: Optional[transforms.Compose] = None):
        self.data_path = Path(data_path)
        print(f"Initializing WebDataset loader for: {self.data_path}")

        if transform is None:
            # NOTE(review): the normalization constants look like the CLIP
            # preprocessing mean/std - confirm against the model in use.
            self.transform = transforms.Compose([
                transforms.Resize(224),
                transforms.CenterCrop(224),
                transforms.ToTensor(),
                transforms.Normalize((0.48145466, 0.4578275, 0.40821073),
                                  (0.26862954, 0.26130258, 0.27577711))
            ])
        else:
            self.transform = transform

        # Discover the shards that actually exist; sorted so the shard order
        # (and therefore the sample order) is deterministic.
        self.shard_paths = sorted(
            p for p in self.data_path.glob("*.tar") if p.is_file()
        )
        if not self.shard_paths:
            raise FileNotFoundError(f"No .tar files found in {self.data_path}")
        print(f"Found {len(self.shard_paths)} tar shards")

        self.total_samples = self._count_samples_from_stats()
        print(f"\nTotal successful samples across all shards: {self.total_samples}")

        if self.total_samples == 0:
            print("\nWARNING: Could not determine number of samples from stats files")
            print("Attempting to count samples in first shard...")
            self.total_samples = self._estimate_samples_from_first_shard()

        if self.total_samples == 0:
            raise ValueError("No samples found in dataset!")

    def _count_samples_from_stats(self) -> int:
        """Sum the ``successes`` field of every readable ``<shard>_stats.json`` file."""
        total = 0
        print("\nReading shard statistics:")
        for shard in self.shard_paths:
            stats_file = shard.with_name(f"{shard.stem}_stats.json")
            if not stats_file.exists():
                continue
            try:
                with open(stats_file) as f:
                    stats = json.load(f)
            except (OSError, ValueError) as e:
                # json.JSONDecodeError is a ValueError subclass. A broken
                # stats file is reported and skipped, not fatal.
                print(f"Error reading stats file {stats_file}: {str(e)}")
                continue

            if not isinstance(stats, dict):
                continue
            if 'successes' not in stats:
                print(f"Warning: No 'successes' field in stats file {shard.stem}")
                continue

            successes = stats['successes']
            if not isinstance(successes, int):
                print(f"Error reading stats file {stats_file}: "
                      f"'successes' is not an integer ({successes!r})")
                continue

            total += successes
            print(f"Shard {shard.stem}: {successes} successful images")
        return total

    def _estimate_samples_from_first_shard(self) -> int:
        """Approximate the total as (images in the first shard) x (number of shards)."""
        import tarfile

        first_tar = self.shard_paths[0]
        try:
            with tarfile.open(first_tar) as tar:
                img_count = sum(
                    1 for member in tar.getmembers()
                    if member.name.lower().endswith(('.jpg', '.jpeg', '.png'))
                )
        except (OSError, EOFError, tarfile.TarError) as e:
            print(f"Error counting files in tar: {str(e)}")
            return 0

        print(f"Found {img_count} images in first shard")
        # Approximate total: assumes every shard holds about as many images
        # as the first one.
        return img_count * len(self.shard_paths)

    def get_length(self) -> int:
        """Returns the total number of samples across all shards.

        The value is exact only when it came from the stats files; otherwise
        it is the first-shard estimate.
        """
        return self.total_samples

    def get_dataset(self, batch_size: int, num_workers: int):
        """Creates a dataset pipeline using webdataset.

        Args:
            batch_size: Number of samples per batch; batching is done inside
                the webdataset pipeline.
            num_workers: Number of loader worker processes.

        Returns:
            A ``wds.WebLoader`` yielding ``(images, keys)`` batches.
        """
        urls = [str(shard) for shard in self.shard_paths]
        print(f"Loading {len(urls)} tar files")

        # Debug first few URLs
        print(f"First few tar files: {urls[:5]}")

        dataset = wds.WebDataset(urls)\
            .decode("pilrgb")\
            .rename(image="jpg;png;jpeg")\
            .map(self.transform_sample)\
            .to_tuple("image", "__key__")\
            .batched(batch_size)

        dataloader = wds.WebLoader(
            dataset,
            batch_size=None,  # batch size handled by webdataset
            shuffle=False,
            num_workers=num_workers
        )

        return dataloader

    def transform_sample(self, sample):
        """Transform a single sample, ensuring image and key are present.

        A sample without an ``image`` entry is replaced by an all-zero
        ``3x224x224`` tensor (with a warning) so the batch layout stays
        intact.
        """
        if 'image' not in sample:
            print(f"Warning: Missing image for key {sample.get('__key__', 'unknown')}")
            # Return a black image
            return {
                "image": torch.zeros((3, 224, 224)),
                "__key__": sample.get('__key__', 'missing')
            }

        image = self.transform(sample['image'])
        return {
            "image": image,
            "__key__": sample.get('__key__', 'unknown')
        }

def get_dataset_type(data_path: str) -> str:
    """Classify a dataset directory as ``'webdataset'`` or ``'images'``.

    Returns ``'webdataset'`` as soon as a direct child of ``data_path``
    matching ``*.tar`` with a ``.tar`` suffix is found; otherwise ``'images'``.
    """
    for candidate in Path(data_path).glob('*.tar'):
        if candidate.suffix == '.tar':
            return 'webdataset'
    return 'images'

def _create_resizable_datasets(h5_file, num_rows, feature_shape):
    """Create row-resizable ``activations`` and ``paths`` datasets in *h5_file*."""
    activations_dataset = h5_file.create_dataset(
        'activations',
        shape=(num_rows, *feature_shape),
        maxshape=(None, *feature_shape),  # first axis may grow or shrink
        dtype='float32',
        chunks=True
    )
    paths_dataset = h5_file.create_dataset(
        'paths',
        shape=(num_rows,),
        maxshape=(None,),
        dtype=h5py.special_dtype(vlen=str),
        chunks=True
    )
    return activations_dataset, paths_dataset


def save_activations(args):
    """Main function to extract and save model activations.

    Builds the model and registers an activation hook on the requested block,
    then streams the dataset through the model and writes the captured
    activations plus the matching sample paths/keys into an HDF5 file with
    two datasets, ``activations`` and ``paths``.

    Args:
        args: Namespace providing ``output_path``, ``dataset_path``, ``split``,
            ``model_library``, ``model_version``, ``token_to_save``,
            ``block_index``, ``block_element``, ``batch_size``,
            ``num_workers``, ``device`` and optionally ``pretrained``.
            NOTE: ``token_to_save`` only influences the output directory
            name in this function.

    Raises:
        FileNotFoundError: If ``args.dataset_path`` does not exist.
        ValueError: If the dataset yields no samples.
    """
    # Fail fast, before any directory is created or model weights are loaded.
    dataset_path = Path(args.dataset_path)
    if not dataset_path.exists():
        raise FileNotFoundError(f"Dataset path does not exist: {dataset_path}")

    # Create output directory if it doesn't exist.
    # The second-to-last component of dataset_path is used as the dataset
    # name (main() appends the split as the last component).
    output_path = Path(args.output_path) / \
                  args.dataset_path.split('/')[-2] / \
                  args.split / \
                  args.model_library / \
                  args.model_version / \
                  args.token_to_save / \
                  str(args.block_index) / \
                  args.block_element
    output_path.mkdir(parents=True, exist_ok=True)

    print(output_path)

    # Configure model
    config = ModelConfig(
        library=CLIPLibrary(args.model_library),
        model_name=args.model_version,
        pretrained=getattr(args, 'pretrained', None),
        device=args.device
    )

    print('Model configured.')

    # Initialize model wrapper
    model = ViTWrapper(config)
    model.eval()

    print('Model loaded.')

    # Register hook for activation extraction
    block_type = BlockType(args.block_element)
    model.register_activation_hook(
        block_idx=args.block_index,
        block_type=block_type
    )

    print('Hook registered.')

    try:
        print(f"Dataset path: {dataset_path}")

        # Detect dataset type and setup appropriate loader
        dataset_type = get_dataset_type(args.dataset_path)
        print(f"Detected dataset type: {dataset_type}")

        if dataset_type == 'webdataset':
            dataset_loader = WebDatasetLoader(args.dataset_path)
            dataloader = dataset_loader.get_dataset(
                batch_size=args.batch_size,
                num_workers=args.num_workers
            )
            total_samples = dataset_loader.get_length()
        else:
            dataset = ImageDataset(args.dataset_path)
            dataloader = DataLoader(
                dataset,
                batch_size=args.batch_size,
                num_workers=args.num_workers,
                pin_memory=True
            )
            total_samples = len(dataset)

        print('Dataset and dataloader created.')
        print(f'Total samples: {total_samples}')

        if total_samples == 0:
            raise ValueError(f"No samples found in dataset: {dataset_path}")

        # Model names such as "ViT-B/32" contain a path separator. Embedding
        # them raw in the file name would point into a sub-directory that
        # does not exist, so '/' is replaced in the file name only.
        safe_model_version = args.model_version.replace('/', '-')
        activation_file = output_path / (
            f"activations_{safe_model_version}_{args.block_index}_{args.block_element}.h5"
        )

        # Key under which the hook stores the activations of the chosen block.
        key = f"block_{args.block_index}_{args.block_element}"

        with h5py.File(activation_file, 'w') as f:
            print('Extracting activations...')

            activations_dataset = None
            paths_dataset = None
            # Running row offset. Batches may be shorter than batch_size, so
            # batch_idx * batch_size is not a reliable row position.
            write_offset = 0

            with torch.no_grad():
                for batch_idx, (images, paths) in enumerate(tqdm(dataloader)):
                    if batch_idx % 10 == 0:  # Log every 10 batches
                        tqdm.write(f"Processing batch {batch_idx}")

                    # Stack images if they arrive as a list of tensors
                    if isinstance(images, list):
                        images = torch.stack(images)

                    images = images.to(args.device)

                    # Forward pass; the hook captures the activations
                    _ = model(images)
                    activations = model.get_activations()[key].cpu().numpy()

                    # The per-sample activation shape is only known after the
                    # first forward pass, so the datasets are created lazily.
                    if activations_dataset is None:
                        activations_dataset, paths_dataset = _create_resizable_datasets(
                            f, total_samples, activations.shape[1:]
                        )

                    end_offset = write_offset + activations.shape[0]

                    # total_samples can be an estimate; grow when it was too low.
                    if end_offset > activations_dataset.shape[0]:
                        activations_dataset.resize(end_offset, axis=0)
                        paths_dataset.resize(end_offset, axis=0)

                    activations_dataset[write_offset:end_offset] = activations
                    paths_dataset[write_offset:end_offset] = paths
                    write_offset = end_offset

                    # Clear stored activations
                    model.clear_activations()

            if activations_dataset is None:
                raise ValueError(f"No samples found in dataset: {dataset_path}")

            # Drop unused trailing rows when fewer samples than expected arrived.
            if write_offset != activations_dataset.shape[0]:
                print(f"Expected {total_samples} samples but wrote {write_offset}; "
                      "trimming datasets.")
                activations_dataset.resize(write_offset, axis=0)
                paths_dataset.resize(write_offset, axis=0)
    finally:
        # Always detach the hooks, even when extraction fails part-way.
        model.remove_hooks()

    print(f'Activations saved to {activation_file}')
    print('Done.')

def main():
    """Parse the command-line arguments and run ``save_activations``.

    Arguments the extraction cannot work without are marked as required, so a
    missing one produces an argparse usage error instead of a ``TypeError``
    later on. The split name is appended to ``--dataset-path`` before the
    arguments are handed to ``save_activations``.
    """
    parser = argparse.ArgumentParser(description='Save model activations from ViT models')

    # Model configuration
    parser.add_argument('--model-library', type=str, choices=['clip', 'open_clip'],
                      required=True,
                      help='Which CLIP library to use')
    parser.add_argument('--token-to-save', type=str, choices=['cls', 'highest_norm'],
                      help='Which transformer token to save activations from', default='cls')
    parser.add_argument('--model-version', type=str, required=True,
                      help='Model version/name. For OpenAI CLIP: "ViT-B/32", "ViT-L/14", or "ViT-L/14@336px". For OpenCLIP: check open_clip documentation')
    parser.add_argument('--pretrained', type=str,
                      help='Pretrained model name for OpenCLIP')

    # Activation extraction configuration
    parser.add_argument('--block-index', type=int, required=True,
                      help='Index of transformer block to extract from')
    parser.add_argument('--block-element', type=str,
                      choices=['attention', 'mlp', 'residual', 'output'],
                      required=True,
                      help='Element of the block to extract')

    # Data and output configuration
    parser.add_argument('--dataset-path', type=str, required=True,
                      help='Path to dataset directory')
    parser.add_argument('--split', type=str, choices=['train', 'val', 'test'],
                      required=True,
                      help='Dataset split to process')
    parser.add_argument('--output-path', type=str, required=True,
                      help='Path to save activations')

    # Runtime configuration
    parser.add_argument('--batch-size', type=int, default=32,
                      help='Batch size for processing')
    parser.add_argument('--num-workers', type=int, default=4,
                      help='Number of dataloader workers')
    parser.add_argument('--device', type=str, default='cuda',
                      help='Device to run on (cuda/cpu)')

    args = parser.parse_args()
    # The split is a sub-directory of the dataset root.
    args.dataset_path = os.path.join(args.dataset_path, args.split)
    save_activations(args)

if __name__ == '__main__':
    # Run example (--split is needed because main() appends it to
    # --dataset-path):
    #
    #   python save_activations.py \
    #       --model-library clip \
    #       --model-version ViT-B/32 \
    #       --block-index 5 \
    #       --block-element mlp \
    #       --dataset-path /path/to/dataset \
    #       --split val \
    #       --output-path /path/to/output \
    #       --batch-size 64 \
    #       --num-workers 8

    main()