Example input
[Detailed error message and symptoms]: Node.js API returning 503 intermittently under high load
[Relevant code snippets and surrounding context]: Express.js REST endpoint with MongoDB connection
[Runtime environment, versions, and configuration]: Production, Node.js 14.x, MongoDB 4.2
[Issue severity and scope - Optional]: High - affecting 30% of users
Example output
Let me help diagnose and resolve this intermittent 503 issue using a systematic approach.
A. Problem Analysis
503 errors that appear only under high load, combined with a MongoDB backend, suggest connection pool exhaustion or resource constraints. The intermittent nature points to a scaling or resource-management issue rather than a straightforward code bug; a quick server-side check is sketched after the indicators below.
Root Cause Indicators:
1. Intermittent failures specifically under load
2. Service unavailability (503) rather than application errors
3. High impact rate affecting 30% of users
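Before deploying any code, the server's own connection counters can confirm whether the pool is under pressure. This is a read-only query run from the mongo shell against the MongoDB 4.2 instance:
```javascript
// `current` vs. `available` shows how close the server is to its connection limit,
// and a rapidly growing `totalCreated` indicates connection churn (new connections
// per request) rather than pool reuse.
db.serverStatus().connections
```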
B. Investigation Steps
Let's create a diagnostic script to help identify the exact bottleneck:
```javascript
const mongoose = require('mongoose');
const promClient = require('prom-client');

// Metrics
const activeConnections = new promClient.Gauge({
  name: 'mongodb_active_connections',
  help: 'Current number of active MongoDB connections'
});

const connectionWaitTime = new promClient.Histogram({
  name: 'mongodb_connection_wait_ms',
  help: 'MongoDB connection wait time in milliseconds',
  buckets: [10, 50, 100, 200, 500, 1000, 2000]
});

// Connection monitoring
function monitorConnections() {
  const conn = mongoose.connection;

  // Poll pool metrics every 5 seconds
  setInterval(() => {
    if (conn.db) {
      conn.db.admin().serverStatus((err, status) => {
        if (!err) {
          activeConnections.set(status.connections.current);
          console.log({
            activeConnections: status.connections.current,
            availableConnections: status.connections.available,
            totalCreatedConnections: status.connections.totalCreated
          });
        }
      });
    }
  }, 5000);

  // Log connection lifecycle events
  conn.on('connected', () => console.log('MongoDB connected'));
  conn.on('disconnected', () => console.log('MongoDB disconnected'));
  conn.on('reconnected', () => console.log('MongoDB reconnected'));
  conn.on('timeout', () => console.log('MongoDB connection timeout'));
}

// Wrap a database operation and record how long it took, including failures
async function withTimedConnection(operation) {
  const startTime = Date.now();
  try {
    return await operation();
  } finally {
    connectionWaitTime.observe(Date.now() - startTime);
  }
}

module.exports = {
  monitorConnections,
  withTimedConnection,
  metrics: {
    activeConnections,
    connectionWaitTime
  }
};
```
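To wire this into the existing service, require the module at startup and wrap the database calls in the affected endpoint. A minimal sketch, assuming the file above is saved as `./diagnostics.js` and `YourModel` stands in for the real Mongoose model:
```javascript
const { monitorConnections, withTimedConnection } = require('./diagnostics');

// Start polling pool metrics once the connection is open
mongoose.connection.once('open', monitorConnections);

// Time the database work inside the affected endpoint
app.get('/api/data', async (req, res, next) => {
  try {
    const doc = await withTimedConnection(() => YourModel.findById(req.query.id));
    res.json(doc);
  } catch (err) {
    next(err);
  }
});
```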
C. Solution Recommendations
1. Immediate Mitigation:
```javascript
// Update the MongoDB connection configuration.
// Mongoose 5.x (MongoDB 3.x driver, typical on Node.js 14) uses `poolSize`;
// on Mongoose 6+ / driver 4.x use `maxPoolSize`, `minPoolSize`, and `maxIdleTimeMS` instead.
mongoose.connect(MONGODB_URI, {
  poolSize: 50,                   // increase from the default of 5
  serverSelectionTimeoutMS: 5000, // fail fast when no server can be selected
  socketTimeoutMS: 45000,         // drop sockets that stall mid-operation
  useNewUrlParser: true,
  useUnifiedTopology: true
});
```
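Note that the pool size applies per Node.js process: if the API runs several instances or cluster workers, keep the combined total (pool size × process count) comfortably below the mongod server's connection limit (`net.maxIncomingConnections`), otherwise a larger pool simply moves the 503s from the application to the database.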
2. Long-term Solutions:
- Implement connection pooling optimization
- Add circuit breaker pattern
- Set up request queuing
- Implement caching layer
- Consider read replicas for scaling (sketched after this list)
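Most of these items are implemented in section D below; the read-replica option is not, so here is a minimal sketch of what it could look like with Mongoose read preferences. The replica-set URI is illustrative, and this assumes the affected endpoints can tolerate slightly stale reads:
```javascript
// Connection-level default: route reads to a secondary when one is available,
// keeping the primary free for writes and strongly consistent reads.
mongoose.connect(
  'mongodb://db1.example.com,db2.example.com,db3.example.com/app?replicaSet=rs0', // illustrative hosts
  { readPreference: 'secondaryPreferred', poolSize: 50 }
);

// Per-query override for latency-tolerant, read-heavy endpoints
YourModel.find({ status: 'active' }).read('secondaryPreferred');
```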
D. Implementation Guide
1. Deploy the diagnostic script to monitor connection patterns
2. Update MongoDB connection configuration
3. Implement the following improvements:
```javascript
const express = require('express');
const mongoose = require('mongoose');
const Redis = require('ioredis');
const CircuitBreaker = require('opossum'); // opossum exports the breaker class directly

const app = express();

// Redis client for caching
const redis = new Redis({
  maxRetriesPerRequest: 3,
  enableReadyCheck: true
});

// Circuit breaker configuration
const breaker = new CircuitBreaker(async (operation) => {
  return await operation();
}, {
  timeout: 3000,
  errorThresholdPercentage: 50,
  resetTimeout: 30000
});

// Request queue implementation
class RequestQueue {
  constructor(maxConcurrent = 50) {
    this.maxConcurrent = maxConcurrent;
    this.running = 0;
    this.queue = [];
  }

  async add(handler) {
    if (this.running < this.maxConcurrent) {
      this.running++;
      try {
        return await handler();
      } finally {
        this.running--;
        this.processQueue();
      }
    }
    // At capacity: park the request until a running handler finishes
    return new Promise((resolve, reject) => {
      this.queue.push({ handler, resolve, reject });
    });
  }

  async processQueue() {
    if (this.queue.length === 0 || this.running >= this.maxConcurrent) return;
    const { handler, resolve, reject } = this.queue.shift();
    this.running++;
    try {
      resolve(await handler());
    } catch (error) {
      reject(error);
    } finally {
      this.running--;
      this.processQueue();
    }
  }
}

const requestQueue = new RequestQueue(50);

// Example API endpoint with optimizations
app.get('/api/data', async (req, res) => {
  try {
    await requestQueue.add(async () => {
      // Check cache first
      const cacheKey = `data:${req.query.id}`;
      const cachedData = await redis.get(cacheKey);
      if (cachedData) {
        return res.json(JSON.parse(cachedData));
      }
      // If not in cache, fetch from MongoDB through the circuit breaker
      const data = await breaker.fire(async () => {
        const result = await YourModel.findById(req.query.id);
        // Cache the result for 5 minutes
        await redis.set(cacheKey, JSON.stringify(result), 'EX', 300);
        return result;
      });
      res.json(data);
    });
  } catch (error) {
    console.error('Request failed:', error);
    res.status(503).json({ error: 'Service temporarily unavailable' });
  }
});

// Health check endpoint
app.get('/health', (req, res) => {
  const health = {
    uptime: process.uptime(),
    timestamp: Date.now(),
    mongoStatus: mongoose.connection.readyState === 1 ? 'connected' : 'disconnected',
    breakerStatus: breaker.stats,
    queueStatus: {
      running: requestQueue.running,
      queued: requestQueue.queue.length
    }
  };
  res.json(health);
});
```
Validation Steps:
1. Deploy diagnostic script
2. Monitor connection metrics (a metrics-exposure sketch follows this list)
3. Gradually roll out optimizations
4. Watch error rates and response times
5. Monitor cache hit rates
6. Track circuit breaker status
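For steps 2, 4, and 5, the prom-client metrics recorded by the diagnostic script can be exposed over HTTP so a Prometheus/Grafana setup, or even `curl`, can watch them alongside `/health`. A minimal sketch, assuming the diagnostic module from section B is loaded in the same process (its Gauge and Histogram register with prom-client's default registry):
```javascript
const promClient = require('prom-client');

// Also collect default Node.js process metrics (event loop lag, memory, GC)
promClient.collectDefaultMetrics();

// Expose everything in the default registry
app.get('/metrics', async (req, res) => {
  res.set('Content-Type', promClient.register.contentType);
  res.end(await promClient.register.metrics()); // async in prom-client v13+; synchronous string in older versions
});
```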
Would you like me to provide more detailed information about any of these components or help implement a specific part of the solution?