Speed is not just a feature, it's a philosophy. Memory is not just storage, it's art.
- Performance Principles
- Benchmarking
- Memory Optimization
- Latency Reduction
- Throughput Maximization
- Caching Strategies
- Engine Optimization
- Model Optimization
- Network Optimization
- Production Tuning
       Speed
         △
        /│\
       / │ \
      /  │  \
     /   │   \
Memory───┴───Accuracy
Every optimization is a trade-off. Choose your battles wisely.
const keyMetrics = {
// User-facing metrics
timeToFirstToken: '< 100ms', // How fast until first output
tokensPerSecond: '> 30', // Generation speed
endToEndLatency: '< 1s', // Total request time
// System metrics
memoryUsage: '< 50% of model size',
cpuUtilization: '< 80%',
gpuUtilization: '> 90% during inference',
cacheHitRate: '> 80%',
// Scale metrics
concurrentRequests: '> 100',
requestsPerSecond: '> 1000',
p99Latency: '< 2s'
};
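
These targets are most useful when they are checked in code rather than eyeballed on a dashboard. A minimal sketch of such a check; the `measured` object and the helper name are illustrative, not part of the router API:

// Illustrative helper: compare measured values against the targets above
function checkTargets(measured) {
  const failures = [];
  if (measured.timeToFirstToken > 100) failures.push('timeToFirstToken'); // ms
  if (measured.tokensPerSecond < 30) failures.push('tokensPerSecond');
  if (measured.p99Latency > 2000) failures.push('p99Latency'); // ms
  return failures; // an empty array means every target was met
}
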
// benchmark.js
import { Benchmark } from 'llm-runner-router/benchmark';
const suite = new Benchmark({
models: ['llama-7b', 'mistral-7b'],
prompts: [
'Short prompt',
'Medium length prompt with more context...',
'Very long prompt with extensive context...'
],
iterations: 100,
warmup: 10
});
const results = await suite.run();
console.log(results);
// Output:
// {
// 'llama-7b': {
//     avgLatency: 245,   // latency figures in ms
// p50: 240,
// p95: 290,
// p99: 320,
// tokensPerSec: 32
// }
// }

// Scenario 1: High-throughput API
async function benchmarkAPI() {
const promises = [];
const startTime = Date.now();
// Simulate 1000 concurrent requests
for (let i = 0; i < 1000; i++) {
promises.push(
router.quick(`Question ${i}`, { maxTokens: 50 })
);
}
await Promise.all(promises);
const duration = Date.now() - startTime;
console.log(`1000 requests in ${duration}ms`);
console.log(`RPS: ${1000 / (duration / 1000)}`);
}
// Scenario 2: Streaming performance
async function benchmarkStreaming() {
const stream = router.stream('Write a long story');
let tokenCount = 0;
const startTime = Date.now();
for await (const token of stream) {
tokenCount++;
}
const duration = Date.now() - startTime;
console.log(`Tokens/sec: ${tokenCount / (duration / 1000)}`);
}
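
Sustained tokens-per-second and time to first token are different quantities, and the second one dominates perceived responsiveness. A sketch that measures it separately, using the same `router.stream` call as above:

async function benchmarkFirstToken() {
  const startTime = Date.now();
  let firstTokenAt = null;
  for await (const token of router.stream('Write a long story')) {
    if (firstTokenAt === null) firstTokenAt = Date.now(); // arrival of the first token
  }
  console.log(`Time to first token: ${firstTokenAt - startTime}ms`);
}
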
// 1. Lazy Loading
const lazyLoadConfig = {
modelPreload: [], // Don't preload any models
loadOnDemand: true,
unloadAfterIdle: 300000 // 5 minutes
};
// 2. Memory Mapping
const mmapConfig = {
useMmap: true, // Memory-mapped files
mlockModels: true, // Lock critical models in RAM
swapPath: '/tmp/llm-swap' // Swap space for overflow
};
// 3. Quantization
const quantConfig = {
defaultQuantization: 'q4_k_m',
dynamicQuantization: true,
targetMemory: '4GB'
};

class MemoryProfiler {
constructor(router) {
this.router = router;
this.baseline = process.memoryUsage();
}
profile() {
const current = process.memoryUsage();
return {
heapUsed: this.formatBytes(current.heapUsed),
heapTotal: this.formatBytes(current.heapTotal),
external: this.formatBytes(current.external),
delta: this.formatBytes(current.heapUsed - this.baseline.heapUsed)
};
}
formatBytes(bytes) {
return `${(bytes / 1024 / 1024).toFixed(2)} MB`;
}
async trackOperation(name, fn) {
const before = process.memoryUsage();
const result = await fn();
const after = process.memoryUsage();
console.log(`${name} memory impact:`, {
heap: this.formatBytes(after.heapUsed - before.heapUsed),
external: this.formatBytes(after.external - before.external)
});
return result;
}
}

// Detect memory leaks
import v8 from 'v8';

const heapSnapshots = [];

// Relative heap growth from the first to the last sample in the window
function calculateTrend(samples) {
  const first = samples[0];
  const last = samples[samples.length - 1];
  return (last - first) / first;
}

setInterval(() => {
  const usage = process.memoryUsage();
  heapSnapshots.push(usage.heapUsed);
  // Keep last 100 snapshots
  if (heapSnapshots.length > 100) {
    heapSnapshots.shift();
  }
  // Check for consistent growth
  if (heapSnapshots.length === 100) {
    const trend = calculateTrend(heapSnapshots);
    if (trend > 0.5) { // 50% growth across the window
      console.warn('Possible memory leak detected!');
      v8.writeHeapSnapshot(); // dump a .heapsnapshot file for offline analysis
    }
  }
}, 60000); // Check every minute

// Optimize time to first token
const firstTokenOptimizations = {
// 1. Model warmup
  async warmupModels(models) {
    for (const model of models) {
      await model.generate('test', { maxTokens: 1 });
    }
  },
  // 2. Prefetch common prompts
  async prefetchCommon(cache) {
    const commonPrompts = [
      'Hello',
      'Can you help me',
      'Explain'
    ];
    for (const prompt of commonPrompts) {
      await cache.prefetch(prompt);
    }
  },
},
// 3. Connection pooling
connectionPool: {
min: 5,
max: 20,
idleTimeout: 60000
}
};

// Optimize inference pipeline
class OptimizedPipeline {
constructor() {
// Pre-compile regex patterns
this.patterns = {
tokenBoundary: /\b/g,
sentence: /[.!?]+/g
};
// Pre-allocate buffers
this.buffers = {
input: new Float32Array(4096),
output: new Float32Array(4096)
};
}
async process(model, prompt, options) {
// Parallel preprocessing
const [tokens, embedding, metadata] = await Promise.all([
this.tokenize(prompt),
this.embed(prompt),
this.extractMetadata(prompt)
]);
// Reuse buffers
this.buffers.input.set(tokens);
// Stream processing
return this.streamProcess(model, this.buffers.input, options);
}
}
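
The Promise.all in process() only pays off when tokenization, embedding, and metadata extraction are genuinely independent. A quick way to confirm the win on your own workload, assuming the pipeline's own tokenize/embed/extractMetadata methods:

import { performance } from 'perf_hooks';

async function comparePreprocessing(pipeline, prompt) {
  let t = performance.now();
  await pipeline.tokenize(prompt);
  await pipeline.embed(prompt);
  await pipeline.extractMetadata(prompt);
  const sequential = performance.now() - t;

  t = performance.now();
  await Promise.all([
    pipeline.tokenize(prompt),
    pipeline.embed(prompt),
    pipeline.extractMetadata(prompt)
  ]);
  const parallel = performance.now() - t;

  console.log(`sequential: ${sequential.toFixed(1)}ms, parallel: ${parallel.toFixed(1)}ms`);
}
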
class BatchProcessor {
  constructor(router) {
    this.router = router;
    this.batch = [];
    this.batchSize = 32;
    this.batchTimeout = 50; // ms
    this.processing = false;
    this.timer = null; // pending flush timer handle
}
async add(prompt, options) {
return new Promise((resolve, reject) => {
this.batch.push({ prompt, options, resolve, reject });
if (this.batch.length >= this.batchSize) {
this.processBatch();
} else {
this.scheduleBatch();
}
});
}
scheduleBatch() {
if (this.timer) return;
this.timer = setTimeout(() => {
this.processBatch();
}, this.batchTimeout);
}
async processBatch() {
if (this.processing || this.batch.length === 0) return;
this.processing = true;
clearTimeout(this.timer);
this.timer = null;
const currentBatch = this.batch.splice(0, this.batchSize);
try {
      // Process batch in parallel (note: the first item's options are applied to the whole batch)
const results = await this.router.batch(
currentBatch.map(item => item.prompt),
currentBatch[0].options
);
// Resolve promises
currentBatch.forEach((item, i) => {
item.resolve(results[i]);
});
} catch (error) {
currentBatch.forEach(item => item.reject(error));
}
this.processing = false;
// Process remaining if any
if (this.batch.length > 0) {
this.processBatch();
}
}
}
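
Callers simply await add(); requests arriving within the 50 ms window (or before the batch fills) are coalesced into a single router.batch call. A usage sketch:

const batcher = new BatchProcessor(router);

// All three calls land in the same batch window, sharing one router.batch() invocation
const answers = await Promise.all([
  batcher.add('Summarize document A', { maxTokens: 50 }),
  batcher.add('Summarize document B', { maxTokens: 50 }),
  batcher.add('Summarize document C', { maxTokens: 50 })
]);
console.log(answers.length); // 3
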
// Use worker threads for CPU-intensive tasks
import { Worker } from 'worker_threads';
class WorkerPool {
constructor(size = 4) {
this.workers = [];
this.queue = [];
// Create workers
for (let i = 0; i < size; i++) {
this.createWorker();
}
}
  createWorker() {
    const worker = new Worker('./inference-worker.js');
    worker.on('message', (result) => {
      // Resolve the task this worker was running
      worker.currentTask.resolve(result);
      worker.currentTask = null;
      // Pull the next queued task, or go idle
      const next = this.queue.shift();
      if (next) {
        worker.currentTask = next;
        worker.postMessage(next.data);
      } else {
        worker.idle = true;
      }
    });
    worker.on('error', (err) => {
      if (worker.currentTask) {
        worker.currentTask.reject(err);
        worker.currentTask = null;
      }
      worker.idle = true;
    });
    worker.idle = true;
    worker.currentTask = null;
    this.workers.push(worker);
  }
  async process(data) {
    return new Promise((resolve, reject) => {
      const task = { resolve, reject, data };
      // Hand the task to an idle worker, or queue it for the next free one
      const worker = this.workers.find(w => w.idle);
      if (worker) {
        worker.idle = false;
        worker.currentTask = task;
        worker.postMessage(data);
      } else {
        this.queue.push(task);
      }
    });
  }
}
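
The pool expects an ./inference-worker.js beside it. A minimal sketch of what that file might contain; runInference is a placeholder for whatever CPU-bound work you are offloading:

// inference-worker.js (illustrative)
import { parentPort } from 'worker_threads';

parentPort.on('message', async (data) => {
  const result = await runInference(data); // placeholder for the actual CPU-bound task
  parentPort.postMessage(result);
});
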
class MultiTierCache {
  constructor() {
// L1: In-memory (hot)
this.l1 = new Map();
this.l1Size = 100;
// L2: Redis (warm)
this.l2 = new RedisCache();
// L3: Disk (cold)
this.l3 = new DiskCache('/var/cache/llm');
this.stats = {
l1Hits: 0,
l2Hits: 0,
l3Hits: 0,
misses: 0
};
}
async get(key) {
// Check L1
if (this.l1.has(key)) {
this.stats.l1Hits++;
return this.l1.get(key);
}
// Check L2
const l2Value = await this.l2.get(key);
if (l2Value) {
this.stats.l2Hits++;
this.promote(key, l2Value, 1);
return l2Value;
}
// Check L3
const l3Value = await this.l3.get(key);
if (l3Value) {
this.stats.l3Hits++;
this.promote(key, l3Value, 2);
return l3Value;
}
this.stats.misses++;
return null;
}
async set(key, value, ttl = 3600) {
// Write to L1
this.l1.set(key, value);
this.evictLRU();
// Write to L2 async
this.l2.set(key, value, ttl).catch(console.error);
// Write to L3 for persistence
this.l3.set(key, value).catch(console.error);
}
  // Copy a value found at a lower tier into the faster tiers above it
  promote(key, value, fromLevel) {
if (fromLevel >= 1) {
this.l1.set(key, value);
this.evictLRU();
}
if (fromLevel >= 2) {
this.l2.set(key, value).catch(console.error);
}
}
evictLRU() {
if (this.l1.size > this.l1Size) {
const firstKey = this.l1.keys().next().value;
this.l1.delete(firstKey);
}
}
}
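
The stats counters exist so you can see where reads are actually served from (the keyMetrics table above targets a cache hit rate above 80%). A small reporting helper, assuming the class above:

function cacheHitReport(cache) {
  const { l1Hits, l2Hits, l3Hits, misses } = cache.stats;
  const total = l1Hits + l2Hits + l3Hits + misses;
  if (total === 0) return 'no traffic yet';
  const pct = n => `${(100 * n / total).toFixed(1)}%`;
  return `L1 ${pct(l1Hits)} | L2 ${pct(l2Hits)} | L3 ${pct(l3Hits)} | miss ${pct(misses)}`;
}
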
class SmartCache {
  constructor() {
    this.cache = new Map();
    this.dependencies = new Map();
  }
  // Look up a key, counting hits so getAdaptiveTTL has usage data to work from
  get(key) {
    const entry = this.cache.get(key);
    if (!entry) return null;
    entry.hits++;
    return entry.value;
  }
set(key, value, dependencies = []) {
this.cache.set(key, {
value,
timestamp: Date.now(),
hits: 0
});
// Track dependencies
for (const dep of dependencies) {
if (!this.dependencies.has(dep)) {
this.dependencies.set(dep, new Set());
}
this.dependencies.get(dep).add(key);
}
}
invalidate(dependency) {
// Invalidate all dependent entries
const dependent = this.dependencies.get(dependency);
if (dependent) {
for (const key of dependent) {
this.cache.delete(key);
}
this.dependencies.delete(dependency);
}
}
// Adaptive TTL based on usage patterns
getAdaptiveTTL(key) {
const entry = this.cache.get(key);
if (!entry) return 3600; // Default 1 hour
const age = Date.now() - entry.timestamp;
const hitRate = entry.hits / (age / 1000);
// High hit rate = longer TTL
if (hitRate > 1) return 7200; // 2 hours
if (hitRate > 0.5) return 3600; // 1 hour
if (hitRate > 0.1) return 1800; // 30 minutes
return 600; // 10 minutes
}
}
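
Dependency tracking earns its keep when something upstream changes, e.g. a model is reloaded or a system prompt is edited. A usage sketch; the keys and dependency names are illustrative:

const cache = new SmartCache();
cache.set('answer:greeting', 'Hello!', ['model:llama-7b']);
cache.set('answer:farewell', 'Goodbye!', ['model:llama-7b']);

// Reloading the model invalidates every cached answer derived from it
cache.invalidate('model:llama-7b');
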
// WebGPU-specific optimizations
const webGPUOptimizations = {
// Use compute shaders efficiently
shaderOptimization: {
workgroupSize: 64,
useSharedMemory: true,
unrollLoops: true
},
// Buffer management
bufferStrategy: {
reuseBuffers: true,
doubleBuffering: true,
alignmentSize: 256
},
// Pipeline caching
pipelineCache: {
enabled: true,
maxSize: 100
}
};

// WASM-specific optimizations
const wasmOptimizations = {
// Memory configuration
memory: {
initial: 256, // 256 pages (16MB)
maximum: 4096, // 4096 pages (256MB)
shared: true
},
// SIMD acceleration
simd: {
enabled: true,
autoVectorize: true
},
// Threading
threading: {
enabled: true,
    workers: navigator.hardwareConcurrency || 4 // browser API; use os.cpus().length in Node
}
};

// Dynamic quantization based on available resources
async function selectQuantization(model, availableMemory) {
const modelSize = model.parameters.size;
const quantizationLevels = [
{ level: 'f32', factor: 1.0 },
{ level: 'f16', factor: 0.5 },
{ level: 'q8_0', factor: 0.25 },
{ level: 'q5_k_m', factor: 0.175 },
{ level: 'q4_k_m', factor: 0.14 },
{ level: 'q3_k_m', factor: 0.11 },
{ level: 'q2_k', factor: 0.08 }
];
for (const quant of quantizationLevels) {
const requiredMemory = modelSize * quant.factor;
if (requiredMemory <= availableMemory) {
return quant.level;
}
}
throw new Error('Model too large for available memory');
}
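
As a sanity check on the factors: a 7B-parameter model stored as f32 weights is roughly 28 GB (7e9 parameters × 4 bytes), so under a 4 GiB budget the loop settles on q4_k_m, since 28 GB × 0.14 ≈ 3.9 GB fits while q5_k_m's 28 GB × 0.175 ≈ 4.9 GB does not. Illustrated below, assuming parameters.size is the f32 byte size:

const sevenB = { parameters: { size: 7e9 * 4 } }; // ~28 GB of f32 weights
const level = await selectQuantization(sevenB, 4 * 1024 ** 3); // 4 GiB available
console.log(level); // 'q4_k_m' (needs ~3.9 GB)
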
// Remove unnecessary model components
class ModelPruner {
async prune(model, options = {}) {
const pruned = { ...model };
// Remove unused layers
if (options.removeUnusedLayers) {
pruned.layers = this.identifyUsedLayers(model);
}
// Reduce vocabulary size
if (options.reduceVocabulary) {
pruned.vocabulary = this.pruneVocabulary(model.vocabulary);
}
// Compress embeddings
if (options.compressEmbeddings) {
pruned.embeddings = await this.compressEmbeddings(model.embeddings);
}
return pruned;
}
}

// Compress requests and responses
import zlib from 'zlib';
const compressionMiddleware = (req, res, next) => {
  // Compress responses (set the headers only after gzip succeeds)
  const originalSend = res.send;
  res.send = function(data) {
    zlib.gzip(data, (err, compressed) => {
      if (err) {
        return originalSend.call(this, data); // fall back to the uncompressed body
      }
      res.setHeader('Content-Encoding', 'gzip');
      res.setHeader('Content-Length', compressed.length);
      originalSend.call(this, compressed);
    });
  };
// Decompress requests
if (req.headers['content-encoding'] === 'gzip') {
const chunks = [];
req.on('data', chunk => chunks.push(chunk));
req.on('end', () => {
const buffer = Buffer.concat(chunks);
      zlib.gunzip(buffer, (err, decompressed) => {
        if (!err) {
          try {
            req.body = JSON.parse(decompressed.toString());
          } catch { /* leave req.body unset on malformed JSON */ }
        }
        next();
      });
});
} else {
next();
}
};

// HTTP/2 connection pooling
import http2 from 'http2';
class ConnectionPool {
constructor(maxConnections = 10) {
this.connections = new Map();
this.maxConnections = maxConnections;
}
getConnection(origin) {
if (!this.connections.has(origin)) {
const session = http2.connect(origin, {
maxSessionMemory: 10,
settings: {
enablePush: false,
initialWindowSize: 1024 * 1024
}
});
this.connections.set(origin, session);
// Clean up on error
session.on('error', () => {
this.connections.delete(origin);
});
}
return this.connections.get(origin);
}
}
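
Reusing one session per origin avoids a fresh TCP and TLS handshake on every request. A usage sketch; the origin and path are illustrative:

const pool = new ConnectionPool();
const session = pool.getConnection('https://inference.example.com');

const req = session.request({ ':method': 'POST', ':path': '/v1/generate' });
req.setEncoding('utf8');
req.on('data', chunk => process.stdout.write(chunk));
req.end(JSON.stringify({ prompt: 'Hello' }));
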
// production.config.js
export default {
// Process configuration
cluster: {
workers: 'auto', // Use all CPU cores
restart: true,
maxMemory: '4G'
},
// Model configuration
models: {
preload: ['llama-7b-q4'],
maxLoaded: 5,
unloadTimeout: 300000
},
// Cache configuration
cache: {
type: 'redis',
maxSize: '10GB',
ttl: 7200,
compression: true
},
// Performance settings
performance: {
maxConcurrent: 100,
queueSize: 1000,
timeout: 30000,
batchSize: 32
},
// Monitoring
monitoring: {
metrics: true,
tracing: true,
profiling: false // Enable only when debugging
}
};// Health check endpoint
app.get('/health', async (req, res) => {
const health = {
status: 'healthy',
timestamp: new Date(),
uptime: process.uptime(),
memory: process.memoryUsage(),
models: router.getLoadedModels(),
metrics: router.getMetrics()
};
// Check critical metrics
if (health.memory.heapUsed > 0.9 * health.memory.heapTotal) {
health.status = 'degraded';
health.warnings = ['High memory usage'];
}
if (health.metrics.errorRate > 0.05) {
health.status = 'unhealthy';
health.errors = ['High error rate'];
}
res.status(health.status === 'healthy' ? 200 : 503);
res.json(health);
});
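
An external watchdog can poll this endpoint and alert on degradation before users notice. A sketch; the host, port, and interval are illustrative:

setInterval(async () => {
  const res = await fetch('http://localhost:3000/health');
  const health = await res.json();
  if (health.status !== 'healthy') {
    console.warn(`Health check: ${health.status}`, health.warnings ?? health.errors ?? []);
  }
}, 30000);
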
Performance is not achieved, it's crafted. Every millisecond saved is a victory! 🚀

Built with 💙 by Echo AI Systems