Skip to content

Latest commit

 

History

History
820 lines (672 loc) · 17.1 KB

File metadata and controls

820 lines (672 loc) · 17.1 KB

⚡ Performance Optimization Guide - Achieving Silicon Nirvana

Speed is not just a feature, it's a philosophy. Memory is not just storage, it's art.

Table of Contents

Performance Principles

The Trinity of Speed

    Speed
      △
     /│\
    / │ \
   /  │  \
  /   │   \
 /    │    \
Memory─────Accuracy

Every optimization is a trade-off. Choose your battles wisely.

Performance Metrics That Matter

const keyMetrics = {
  // User-facing metrics
  timeToFirstToken: '< 100ms',    // How fast until first output
  tokensPerSecond: '> 30',         // Generation speed
  endToEndLatency: '< 1s',         // Total request time
  
  // System metrics
  memoryUsage: '< 50% of model size',
  cpuUtilization: '< 80%',
  gpuUtilization: '> 90% during inference',
  cacheHitRate: '> 80%',
  
  // Scale metrics
  concurrentRequests: '> 100',
  requestsPerSecond: '> 1000',
  p99Latency: '< 2s'
};

Benchmarking

Performance Testing Suite

// benchmark.js
import { Benchmark } from 'llm-runner-router/benchmark';

const suite = new Benchmark({
  models: ['llama-7b', 'mistral-7b'],
  prompts: [
    'Short prompt',
    'Medium length prompt with more context...',
    'Very long prompt with extensive context...'
  ],
  iterations: 100,
  warmup: 10
});

const results = await suite.run();
console.log(results);

// Output:
// {
//   'llama-7b': {
//     avgLatency: 245,
//     p50: 240,
//     p95: 290,
//     p99: 320,
//     tokensPerSec: 32
//   }
// }

Real-world Benchmark Scenarios

// Scenario 1: High-throughput API
async function benchmarkAPI() {
  const promises = [];
  const startTime = Date.now();
  
  // Simulate 1000 concurrent requests
  for (let i = 0; i < 1000; i++) {
    promises.push(
      router.quick(`Question ${i}`, { maxTokens: 50 })
    );
  }
  
  await Promise.all(promises);
  const duration = Date.now() - startTime;
  
  console.log(`1000 requests in ${duration}ms`);
  console.log(`RPS: ${1000 / (duration / 1000)}`);
}

// Scenario 2: Streaming performance
async function benchmarkStreaming() {
  const stream = router.stream('Write a long story');
  let tokenCount = 0;
  const startTime = Date.now();
  
  for await (const token of stream) {
    tokenCount++;
  }
  
  const duration = Date.now() - startTime;
  console.log(`Tokens/sec: ${tokenCount / (duration / 1000)}`);
}

Memory Optimization

Memory Management Strategies

// 1. Lazy Loading
const lazyLoadConfig = {
  modelPreload: [],  // Don't preload any models
  loadOnDemand: true,
  unloadAfterIdle: 300000  // 5 minutes
};

// 2. Memory Mapping
const mmapConfig = {
  useMmap: true,     // Memory-mapped files
  mlockModels: true, // Lock critical models in RAM
  swapPath: '/tmp/llm-swap'  // Swap space for overflow
};

// 3. Quantization
const quantConfig = {
  defaultQuantization: 'q4_k_m',
  dynamicQuantization: true,
  targetMemory: '4GB'
};

Memory Profiling

class MemoryProfiler {
  constructor(router) {
    this.router = router;
    this.baseline = process.memoryUsage();
  }
  
  profile() {
    const current = process.memoryUsage();
    return {
      heapUsed: this.formatBytes(current.heapUsed),
      heapTotal: this.formatBytes(current.heapTotal),
      external: this.formatBytes(current.external),
      delta: this.formatBytes(current.heapUsed - this.baseline.heapUsed)
    };
  }
  
  formatBytes(bytes) {
    return `${(bytes / 1024 / 1024).toFixed(2)} MB`;
  }
  
  async trackOperation(name, fn) {
    const before = process.memoryUsage();
    const result = await fn();
    const after = process.memoryUsage();
    
    console.log(`${name} memory impact:`, {
      heap: this.formatBytes(after.heapUsed - before.heapUsed),
      external: this.formatBytes(after.external - before.external)
    });
    
    return result;
  }
}

Memory Leak Detection

// Detect memory leaks
const heapSnapshots = [];

setInterval(() => {
  const usage = process.memoryUsage();
  heapSnapshots.push(usage.heapUsed);
  
  // Keep last 100 snapshots
  if (heapSnapshots.length > 100) {
    heapSnapshots.shift();
  }
  
  // Check for consistent growth
  if (heapSnapshots.length === 100) {
    const trend = this.calculateTrend(heapSnapshots);
    if (trend > 0.5) {  // 50% growth
      console.warn('Possible memory leak detected!');
      this.dumpHeapSnapshot();
    }
  }
}, 60000);  // Check every minute

Latency Reduction

First Token Optimization

// Optimize time to first token
const firstTokenOptimizations = {
  // 1. Model warmup
  async warmupModels() {
    for (const model of this.preloadModels) {
      await model.generate('test', { maxTokens: 1 });
    }
  },
  
  // 2. Prefetch common prompts
  async prefetchCommon() {
    const commonPrompts = [
      'Hello',
      'Can you help me',
      'Explain'
    ];
    
    for (const prompt of commonPrompts) {
      await this.cache.prefetch(prompt);
    }
  },
  
  // 3. Connection pooling
  connectionPool: {
    min: 5,
    max: 20,
    idleTimeout: 60000
  }
};

Pipeline Optimization

// Optimize inference pipeline
class OptimizedPipeline {
  constructor() {
    // Pre-compile regex patterns
    this.patterns = {
      tokenBoundary: /\b/g,
      sentence: /[.!?]+/g
    };
    
    // Pre-allocate buffers
    this.buffers = {
      input: new Float32Array(4096),
      output: new Float32Array(4096)
    };
  }
  
  async process(model, prompt, options) {
    // Parallel preprocessing
    const [tokens, embedding, metadata] = await Promise.all([
      this.tokenize(prompt),
      this.embed(prompt),
      this.extractMetadata(prompt)
    ]);
    
    // Reuse buffers
    this.buffers.input.set(tokens);
    
    // Stream processing
    return this.streamProcess(model, this.buffers.input, options);
  }
}

Throughput Maximization

Batching Strategies

class BatchProcessor {
  constructor(router) {
    this.router = router;
    this.batch = [];
    this.batchSize = 32;
    this.batchTimeout = 50;  // ms
    this.processing = false;
  }
  
  async add(prompt, options) {
    return new Promise((resolve, reject) => {
      this.batch.push({ prompt, options, resolve, reject });
      
      if (this.batch.length >= this.batchSize) {
        this.processBatch();
      } else {
        this.scheduleBatch();
      }
    });
  }
  
  scheduleBatch() {
    if (this.timer) return;
    
    this.timer = setTimeout(() => {
      this.processBatch();
    }, this.batchTimeout);
  }
  
  async processBatch() {
    if (this.processing || this.batch.length === 0) return;
    
    this.processing = true;
    clearTimeout(this.timer);
    this.timer = null;
    
    const currentBatch = this.batch.splice(0, this.batchSize);
    
    try {
      // Process batch in parallel
      const results = await this.router.batch(
        currentBatch.map(item => item.prompt),
        currentBatch[0].options
      );
      
      // Resolve promises
      currentBatch.forEach((item, i) => {
        item.resolve(results[i]);
      });
    } catch (error) {
      currentBatch.forEach(item => item.reject(error));
    }
    
    this.processing = false;
    
    // Process remaining if any
    if (this.batch.length > 0) {
      this.processBatch();
    }
  }
}

Worker Pool

// Use worker threads for CPU-intensive tasks
import { Worker } from 'worker_threads';

class WorkerPool {
  constructor(size = 4) {
    this.workers = [];
    this.queue = [];
    
    // Create workers
    for (let i = 0; i < size; i++) {
      this.createWorker();
    }
  }
  
  createWorker() {
    const worker = new Worker('./inference-worker.js');
    
    worker.on('message', (result) => {
      const task = this.queue.shift();
      if (task) {
        task.resolve(result);
      }
      
      // Process next in queue
      if (this.queue.length > 0) {
        const next = this.queue[0];
        worker.postMessage(next.data);
      } else {
        worker.idle = true;
      }
    });
    
    worker.idle = true;
    this.workers.push(worker);
  }
  
  async process(data) {
    return new Promise((resolve, reject) => {
      // Find idle worker
      const worker = this.workers.find(w => w.idle);
      
      if (worker) {
        worker.idle = false;
        worker.postMessage(data);
        this.queue.push({ resolve, reject, data });
      } else {
        // Queue for later
        this.queue.push({ resolve, reject, data });
      }
    });
  }
}

Caching Strategies

Multi-tier Cache

class MultiTierCache {
  constructor() {
    // L1: In-memory (hot)
    this.l1 = new Map();
    this.l1Size = 100;
    
    // L2: Redis (warm)
    this.l2 = new RedisCache();
    
    // L3: Disk (cold)
    this.l3 = new DiskCache('/var/cache/llm');
    
    this.stats = {
      l1Hits: 0,
      l2Hits: 0,
      l3Hits: 0,
      misses: 0
    };
  }
  
  async get(key) {
    // Check L1
    if (this.l1.has(key)) {
      this.stats.l1Hits++;
      return this.l1.get(key);
    }
    
    // Check L2
    const l2Value = await this.l2.get(key);
    if (l2Value) {
      this.stats.l2Hits++;
      this.promote(key, l2Value, 1);
      return l2Value;
    }
    
    // Check L3
    const l3Value = await this.l3.get(key);
    if (l3Value) {
      this.stats.l3Hits++;
      this.promote(key, l3Value, 2);
      return l3Value;
    }
    
    this.stats.misses++;
    return null;
  }
  
  async set(key, value, ttl = 3600) {
    // Write to L1
    this.l1.set(key, value);
    this.evictLRU();
    
    // Write to L2 async
    this.l2.set(key, value, ttl).catch(console.error);
    
    // Write to L3 for persistence
    this.l3.set(key, value).catch(console.error);
  }
  
  promote(key, value, fromLevel) {
    if (fromLevel >= 1) {
      this.l1.set(key, value);
      this.evictLRU();
    }
    if (fromLevel >= 2) {
      this.l2.set(key, value).catch(console.error);
    }
  }
  
  evictLRU() {
    if (this.l1.size > this.l1Size) {
      const firstKey = this.l1.keys().next().value;
      this.l1.delete(firstKey);
    }
  }
}

Smart Cache Invalidation

class SmartCache {
  constructor() {
    this.cache = new Map();
    this.dependencies = new Map();
  }
  
  set(key, value, dependencies = []) {
    this.cache.set(key, {
      value,
      timestamp: Date.now(),
      hits: 0
    });
    
    // Track dependencies
    for (const dep of dependencies) {
      if (!this.dependencies.has(dep)) {
        this.dependencies.set(dep, new Set());
      }
      this.dependencies.get(dep).add(key);
    }
  }
  
  invalidate(dependency) {
    // Invalidate all dependent entries
    const dependent = this.dependencies.get(dependency);
    if (dependent) {
      for (const key of dependent) {
        this.cache.delete(key);
      }
      this.dependencies.delete(dependency);
    }
  }
  
  // Adaptive TTL based on usage patterns
  getAdaptiveTTL(key) {
    const entry = this.cache.get(key);
    if (!entry) return 3600;  // Default 1 hour
    
    const age = Date.now() - entry.timestamp;
    const hitRate = entry.hits / (age / 1000);
    
    // High hit rate = longer TTL
    if (hitRate > 1) return 7200;    // 2 hours
    if (hitRate > 0.5) return 3600;  // 1 hour
    if (hitRate > 0.1) return 1800;  // 30 minutes
    return 600;  // 10 minutes
  }
}

Engine Optimization

WebGPU Optimization

// WebGPU-specific optimizations
const webGPUOptimizations = {
  // Use compute shaders efficiently
  shaderOptimization: {
    workgroupSize: 64,
    useSharedMemory: true,
    unrollLoops: true
  },
  
  // Buffer management
  bufferStrategy: {
    reuseBuffers: true,
    doubleBuffering: true,
    alignmentSize: 256
  },
  
  // Pipeline caching
  pipelineCache: {
    enabled: true,
    maxSize: 100
  }
};

WASM Optimization

// WASM-specific optimizations
const wasmOptimizations = {
  // Memory configuration
  memory: {
    initial: 256,  // 256 pages (16MB)
    maximum: 4096, // 4096 pages (256MB)
    shared: true
  },
  
  // SIMD acceleration
  simd: {
    enabled: true,
    autoVectorize: true
  },
  
  // Threading
  threading: {
    enabled: true,
    workers: navigator.hardwareConcurrency || 4
  }
};

Model Optimization

Quantization Strategies

// Dynamic quantization based on available resources
async function selectQuantization(model, availableMemory) {
  const modelSize = model.parameters.size;
  const quantizationLevels = [
    { level: 'f32', factor: 1.0 },
    { level: 'f16', factor: 0.5 },
    { level: 'q8_0', factor: 0.25 },
    { level: 'q5_k_m', factor: 0.175 },
    { level: 'q4_k_m', factor: 0.14 },
    { level: 'q3_k_m', factor: 0.11 },
    { level: 'q2_k', factor: 0.08 }
  ];
  
  for (const quant of quantizationLevels) {
    const requiredMemory = modelSize * quant.factor;
    if (requiredMemory <= availableMemory) {
      return quant.level;
    }
  }
  
  throw new Error('Model too large for available memory');
}

Model Pruning

// Remove unnecessary model components
class ModelPruner {
  async prune(model, options = {}) {
    const pruned = { ...model };
    
    // Remove unused layers
    if (options.removeUnusedLayers) {
      pruned.layers = this.identifyUsedLayers(model);
    }
    
    // Reduce vocabulary size
    if (options.reduceVocabulary) {
      pruned.vocabulary = this.pruneVocabulary(model.vocabulary);
    }
    
    // Compress embeddings
    if (options.compressEmbeddings) {
      pruned.embeddings = await this.compressEmbeddings(model.embeddings);
    }
    
    return pruned;
  }
}

Network Optimization

Request Compression

// Compress requests and responses
import zlib from 'zlib';

const compressionMiddleware = (req, res, next) => {
  // Compress responses
  res.setHeader('Content-Encoding', 'gzip');
  const originalSend = res.send;
  
  res.send = function(data) {
    zlib.gzip(data, (err, compressed) => {
      if (err) {
        return originalSend.call(this, data);
      }
      res.setHeader('Content-Length', compressed.length);
      originalSend.call(this, compressed);
    });
  };
  
  // Decompress requests
  if (req.headers['content-encoding'] === 'gzip') {
    const chunks = [];
    req.on('data', chunk => chunks.push(chunk));
    req.on('end', () => {
      const buffer = Buffer.concat(chunks);
      zlib.gunzip(buffer, (err, decompressed) => {
        if (!err) {
          req.body = JSON.parse(decompressed.toString());
        }
        next();
      });
    });
  } else {
    next();
  }
};

Connection Pooling

// HTTP/2 connection pooling
import http2 from 'http2';

class ConnectionPool {
  constructor(maxConnections = 10) {
    this.connections = new Map();
    this.maxConnections = maxConnections;
  }
  
  getConnection(origin) {
    if (!this.connections.has(origin)) {
      const session = http2.connect(origin, {
        maxSessionMemory: 10,
        settings: {
          enablePush: false,
          initialWindowSize: 1024 * 1024
        }
      });
      
      this.connections.set(origin, session);
      
      // Clean up on error
      session.on('error', () => {
        this.connections.delete(origin);
      });
    }
    
    return this.connections.get(origin);
  }
}

Production Tuning

Configuration for Scale

// production.config.js
export default {
  // Process configuration
  cluster: {
    workers: 'auto',  // Use all CPU cores
    restart: true,
    maxMemory: '4G'
  },
  
  // Model configuration
  models: {
    preload: ['llama-7b-q4'],
    maxLoaded: 5,
    unloadTimeout: 300000
  },
  
  // Cache configuration
  cache: {
    type: 'redis',
    maxSize: '10GB',
    ttl: 7200,
    compression: true
  },
  
  // Performance settings
  performance: {
    maxConcurrent: 100,
    queueSize: 1000,
    timeout: 30000,
    batchSize: 32
  },
  
  // Monitoring
  monitoring: {
    metrics: true,
    tracing: true,
    profiling: false  // Enable only when debugging
  }
};

Health Checks

// Health check endpoint
app.get('/health', async (req, res) => {
  const health = {
    status: 'healthy',
    timestamp: new Date(),
    uptime: process.uptime(),
    memory: process.memoryUsage(),
    models: router.getLoadedModels(),
    metrics: router.getMetrics()
  };
  
  // Check critical metrics
  if (health.memory.heapUsed > 0.9 * health.memory.heapTotal) {
    health.status = 'degraded';
    health.warnings = ['High memory usage'];
  }
  
  if (health.metrics.errorRate > 0.05) {
    health.status = 'unhealthy';
    health.errors = ['High error rate'];
  }
  
  res.status(health.status === 'healthy' ? 200 : 503);
  res.json(health);
});

Performance is not achieved, it's crafted. Every millisecond saved is a victory! 🚀

Built with 💙 by Echo AI Systems