Best Practices, Error Handling, and Optimization

In this final part, we’ll cover essential best practices for building robust, scalable, and maintainable web scraping applications. You’ll learn advanced error handling techniques, performance optimization strategies, security considerations, and monitoring approaches.

Comprehensive Error Handling

Error Classification System

lib/errors/scraping-errors.ts
/** Categories of failure a scraping operation can report. */
export enum ErrorType {
  NETWORK = 'NETWORK',
  AUTHENTICATION = 'AUTHENTICATION',
  RATE_LIMIT = 'RATE_LIMIT',
  PARSING = 'PARSING',
  VALIDATION = 'VALIDATION',
  TIMEOUT = 'TIMEOUT',
  QUOTA_EXCEEDED = 'QUOTA_EXCEEDED',
  SITE_BLOCKED = 'SITE_BLOCKED'
}

/**
 * Error raised by scraping operations. Carries enough context (category,
 * URL, HTTP status, retry hints) for callers to decide whether — and when —
 * to retry the request.
 */
export class ScrapingError extends Error {
  constructor(
    message: string,
    public type: ErrorType,
    public url: string,
    public statusCode?: number,
    public retryable: boolean = true,
    public retryAfter?: number
  ) {
    super(message);
    this.name = 'ScrapingError';
  }

  /** Plain-object form of the error, suitable for structured logging. */
  toJSON() {
    const { name, message, type, url, statusCode, retryable, retryAfter, stack } = this;
    return { name, message, type, url, statusCode, retryable, retryAfter, stack };
  }
}
/**
 * Translates raw FireCrawl/HTTP-client errors into typed ScrapingErrors.
 */
export class ErrorHandler {
  /**
   * Classify an arbitrary error thrown while scraping `url`.
   *
   * @param error - The raw error (axios-style, with optional `response`/`code`).
   * @param url - The URL whose request failed.
   * @returns A ScrapingError carrying category, status, and retry metadata.
   */
  static handleFireCrawlError(error: any, url: string): ScrapingError {
    const status: number | undefined = error.response?.status;

    if (status === 429) {
      // The Retry-After header arrives as a STRING (seconds, or an HTTP date);
      // ScrapingError.retryAfter is typed `number` and is later multiplied by
      // 1000 for the backoff delay, so coerce numeric values and drop the rest.
      const rawRetryAfter = error.response.headers?.['retry-after'];
      const parsed = Number(rawRetryAfter);
      const retryAfter =
        rawRetryAfter !== undefined && !Number.isNaN(parsed) ? parsed : undefined;
      return new ScrapingError(
        'Rate limit exceeded',
        ErrorType.RATE_LIMIT,
        url,
        429,
        true,
        retryAfter
      );
    }

    if (status === 401) {
      // Bad credentials will not succeed on retry.
      return new ScrapingError(
        'Authentication failed',
        ErrorType.AUTHENTICATION,
        url,
        401,
        false
      );
    }

    if (error.code === 'ENOTFOUND' || error.code === 'ECONNREFUSED') {
      return new ScrapingError(
        'Network connection failed',
        ErrorType.NETWORK,
        url,
        undefined,
        true
      );
    }

    if (error.code === 'ETIMEDOUT') {
      return new ScrapingError(
        'Request timeout',
        ErrorType.TIMEOUT,
        url,
        undefined,
        true
      );
    }

    // Fallback: anything unclassified is treated as a retryable parsing failure.
    return new ScrapingError(
      error.message || 'Unknown scraping error',
      ErrorType.PARSING,
      url,
      status,
      true
    );
  }
}

Retry Strategy Implementation

lib/retry/retry-strategy.ts
/** Tunable knobs for exponential-backoff retries. */
interface RetryConfig {
  maxAttempts: number;
  baseDelay: number;
  maxDelay: number;
  backoffMultiplier: number;
  jitter: boolean;
}

/**
 * Runs an async operation with exponential backoff, honoring non-retryable
 * ScrapingErrors and server-provided retry-after hints.
 */
export class RetryStrategy {
  private config: RetryConfig;

  constructor(config: Partial<RetryConfig> = {}) {
    const defaults: RetryConfig = {
      maxAttempts: 3,
      baseDelay: 1000,
      maxDelay: 30000,
      backoffMultiplier: 2,
      jitter: true
    };
    this.config = { ...defaults, ...config };
  }

  /**
   * Execute `operation`, retrying on failure.
   *
   * @param operation - The async work to attempt.
   * @param shouldRetry - Optional predicate; returning false stops retrying.
   * @throws The last observed error once retries are exhausted, or the
   *         original error immediately when it is a non-retryable ScrapingError.
   */
  async execute<T>(
    operation: () => Promise<T>,
    shouldRetry: (error: Error) => boolean = () => true
  ): Promise<T> {
    let mostRecentError: Error;
    for (let tryNumber = 1; tryNumber <= this.config.maxAttempts; tryNumber++) {
      try {
        return await operation();
      } catch (caught) {
        mostRecentError = caught as Error;
        // Permanent failures (e.g. auth errors) must not be retried.
        if (caught instanceof ScrapingError && !caught.retryable) {
          throw caught;
        }
        // Stop when the caller vetoes a retry or this was the final attempt.
        if (!shouldRetry(caught as Error) || tryNumber === this.config.maxAttempts) {
          break;
        }
        const waitMs = this.calculateDelay(tryNumber, caught as ScrapingError);
        console.log(`⏳ Attempt ${tryNumber} failed, retrying in ${waitMs}ms...`);
        await this.sleep(waitMs);
      }
    }
    throw mostRecentError!;
  }

  /** Compute the wait before the next attempt, in milliseconds. */
  private calculateDelay(attempt: number, error?: ScrapingError): number {
    // A server-specified Retry-After hint (seconds) takes precedence.
    if (error?.retryAfter) {
      return error.retryAfter * 1000;
    }
    const exponential = this.config.baseDelay * this.config.backoffMultiplier ** (attempt - 1);
    const capped = Math.min(exponential, this.config.maxDelay);
    // Jitter spreads retries across [50%, 100%] of the capped delay so
    // concurrent clients don't retry in lock-step.
    const jittered = this.config.jitter ? capped * (0.5 + Math.random() * 0.5) : capped;
    return Math.round(jittered);
  }

  /** Promise-based setTimeout. */
  private sleep(ms: number): Promise<void> {
    return new Promise(done => setTimeout(done, ms));
  }
}
// Usage example
// Shared default strategy: 3 attempts, 2 s initial delay doubling up to 60 s,
// with jitter enabled to avoid synchronized retry storms across clients.
export const defaultRetryStrategy = new RetryStrategy({
maxAttempts: 3,
baseDelay: 2000,
maxDelay: 60000,
backoffMultiplier: 2,
jitter: true
});

Performance Optimization

Intelligent Caching System

lib/cache/smart-cache.ts
/** Tuning options for SmartCache. */
interface CacheStrategy {
  ttl: number;        // default time-to-live, in milliseconds
  maxSize: number;    // maximum number of entries before eviction kicks in
  evictionPolicy: 'LRU' | 'LFU' | 'TTL';
}

/** Internal record stored per cache key. */
interface CacheEntry {
  data: any;
  createdAt: number;
  lastAccessed: number;
  expiresAt: number;
}

/**
 * In-memory TTL cache with pluggable eviction (LRU / LFU / oldest-created).
 */
export class SmartCache {
  private cache = new Map<string, CacheEntry>();
  private accessCount = new Map<string, number>();
  private strategy: CacheStrategy;
  // Real hit/miss counters so getStats() can report an actual hit rate.
  // (The previous implementation divided cache size by total accesses,
  // which is not a hit rate.)
  private hits = 0;
  private misses = 0;

  constructor(strategy: CacheStrategy) {
    this.strategy = strategy;
  }

  /** Fetch a value, or null when the key is absent or expired. */
  async get<T>(key: string): Promise<T | null> {
    const entry = this.cache.get(key);
    if (!entry || this.isExpired(entry)) {
      if (entry) this.delete(key); // drop expired entries eagerly
      this.misses++;
      return null;
    }
    this.hits++;
    // Update access statistics used by the LFU / LRU policies.
    this.accessCount.set(key, (this.accessCount.get(key) || 0) + 1);
    entry.lastAccessed = Date.now();
    return entry.data as T;
  }

  /** Store a value, evicting one entry first when the cache is full. */
  async set<T>(key: string, data: T, customTtl?: number): Promise<void> {
    // `??` (not `||`) so an explicit customTtl of 0 is honored.
    const ttl = customTtl ?? this.strategy.ttl;
    const now = Date.now();
    // Only evict when inserting a NEW key would exceed capacity;
    // overwriting an existing key never needs an eviction.
    if (!this.cache.has(key) && this.cache.size >= this.strategy.maxSize) {
      this.evict();
    }
    this.cache.set(key, {
      data,
      createdAt: now,
      lastAccessed: now,
      expiresAt: now + ttl
    });
    this.accessCount.set(key, 1);
  }

  /** Remove one entry according to the configured policy. */
  private evict(): void {
    let victim: string | undefined;
    switch (this.strategy.evictionPolicy) {
      case 'LRU':
        victim = this.findLRUKey();
        break;
      case 'LFU':
        victim = this.findLFUKey();
        break;
      case 'TTL':
        victim = this.findOldestKey();
        break;
    }
    if (victim !== undefined) {
      this.delete(victim);
    }
  }

  /**
   * Key with the oldest lastAccessed timestamp. Initialized to Infinity so
   * entries touched in the current millisecond are still candidates — the
   * previous Date.now() seed could make eviction silently select nothing.
   */
  private findLRUKey(): string | undefined {
    let oldest = Infinity;
    let lruKey: string | undefined;
    for (const [key, entry] of this.cache) {
      if (entry.lastAccessed < oldest) {
        oldest = entry.lastAccessed;
        lruKey = key;
      }
    }
    return lruKey;
  }

  /** Key with the smallest access count. */
  private findLFUKey(): string | undefined {
    let minCount = Infinity;
    let lfuKey: string | undefined;
    for (const [key, count] of this.accessCount) {
      if (count < minCount) {
        minCount = count;
        lfuKey = key;
      }
    }
    return lfuKey;
  }

  /** Key with the earliest creation time. */
  private findOldestKey(): string | undefined {
    let earliest = Infinity;
    let oldestKey: string | undefined;
    for (const [key, entry] of this.cache) {
      if (entry.createdAt < earliest) {
        earliest = entry.createdAt;
        oldestKey = key;
      }
    }
    return oldestKey;
  }

  private isExpired(entry: CacheEntry): boolean {
    return Date.now() > entry.expiresAt;
  }

  private delete(key: string): void {
    this.cache.delete(key);
    this.accessCount.delete(key);
  }

  /** Snapshot of cache occupancy and effectiveness. */
  getStats() {
    return {
      size: this.cache.size,
      maxSize: this.strategy.maxSize,
      hitRate: this.calculateHitRate(),
      evictionPolicy: this.strategy.evictionPolicy
    };
  }

  /** Percentage of get() calls served from the cache (0 before any read). */
  private calculateHitRate(): number {
    const total = this.hits + this.misses;
    return total > 0 ? (this.hits / total) * 100 : 0;
  }
}

Request Batching and Pooling

lib/optimization/request-pool.ts
/** A request waiting for a free slot in the pool. */
interface QueuedRequest {
  key: string;
  requestFn: () => Promise<any>;
  priority: number;
  resolve: (value: any) => void;
  reject: (error: any) => void;
}

/**
 * Priority queue that throttles concurrent requests, paces them with a delay,
 * and de-duplicates in-flight requests that share a key.
 */
export class RequestPool {
  private activeRequests = new Map<string, Promise<any>>();
  private requestQueue: QueuedRequest[] = [];
  private processing = false;
  private maxConcurrent: number;
  private requestDelay: number;

  constructor(maxConcurrent: number = 5, requestDelay: number = 1000) {
    this.maxConcurrent = maxConcurrent;
    this.requestDelay = requestDelay;
  }

  /**
   * Enqueue a request. Higher `priority` runs first.
   *
   * @param key - Identity for de-duplication: callers asking for a key that is
   *   already IN FLIGHT share its promise. (Requests still waiting in the
   *   queue are not de-duplicated — only active ones.)
   */
  async addRequest<T>(
    key: string,
    requestFn: () => Promise<T>,
    priority: number = 0
  ): Promise<T> {
    const existingRequest = this.activeRequests.get(key);
    if (existingRequest) {
      return existingRequest as Promise<T>;
    }
    return new Promise((resolve, reject) => {
      this.requestQueue.push({ key, requestFn, priority, resolve, reject });
      // Highest priority first; the stable sort keeps FIFO order within a tier.
      this.requestQueue.sort((a, b) => b.priority - a.priority);
      this.processQueue();
    });
  }

  /** Drain the queue while respecting the concurrency limit and pacing delay. */
  private async processQueue(): Promise<void> {
    if (this.processing || this.requestQueue.length === 0) {
      return;
    }
    this.processing = true;
    while (this.requestQueue.length > 0 && this.activeRequests.size < this.maxConcurrent) {
      const request = this.requestQueue.shift()!;
      this.executeRequest(request);
      // Space out successive request launches.
      if (this.requestQueue.length > 0) {
        await new Promise(resolve => setTimeout(resolve, this.requestDelay));
      }
    }
    this.processing = false;
    // Requests may remain when the concurrency limit was hit; try again later.
    if (this.requestQueue.length > 0) {
      setTimeout(() => this.processQueue(), this.requestDelay);
    }
  }

  /** Start a request, wiring its outcome back to the caller's promise. */
  private executeRequest(request: QueuedRequest): void {
    const { key, requestFn, resolve, reject } = request;
    const promise = requestFn()
      .then(result => {
        resolve(result);
        return result;
      })
      .catch(error => {
        reject(error);
        // Re-throw so de-duplicated callers sharing this promise also reject.
        throw error;
      })
      .finally(() => {
        this.activeRequests.delete(key);
        this.processQueue();
      });
    this.activeRequests.set(key, promise);
    // BUGFIX: the pooled copy may never be awaited (no duplicate caller), so a
    // failed request used to surface as an unhandled promise rejection and
    // could crash the process. Attaching a no-op handler defuses it while
    // still letting duplicate callers observe the rejection via `promise`.
    promise.catch(() => {});
  }

  /** Current pool occupancy. */
  getStats() {
    return {
      activeRequests: this.activeRequests.size,
      queuedRequests: this.requestQueue.length,
      maxConcurrent: this.maxConcurrent
    };
  }
}

/** Shared pool: up to 10 concurrent requests, launched at least 500 ms apart. */
export const globalRequestPool = new RequestPool(10, 500);

Security Best Practices

Input Validation and Sanitization

lib/security/validators.ts
import validator from 'validator';
/**
 * Input-validation helpers for scraping endpoints.
 */
export class SecurityValidator {
  /**
   * Validate a user-supplied URL: must be well-formed http(s) and must not
   * point at localhost or private/internal address space (SSRF guard).
   */
  static validateUrl(url: string): { valid: boolean; error?: string } {
    if (!url || typeof url !== 'string') {
      return { valid: false, error: 'URL is required and must be a string' };
    }
    if (!validator.isURL(url, {
      protocols: ['http', 'https'],
      require_protocol: true
    })) {
      return { valid: false, error: 'Invalid URL format' };
    }
    // BUGFIX: test the PARSED hostname rather than the raw URL string.
    // Matching the whole string was both bypassable (e.g.
    // "https://evil@localhost/" hides the host) and produced false positives
    // (e.g. "/page10.html" tripped the /10\./ pattern).
    let hostname: string;
    try {
      hostname = new URL(url).hostname.toLowerCase();
    } catch {
      return { valid: false, error: 'Invalid URL format' };
    }
    const privateHostPatterns = [
      /^localhost$/,
      /^127\./,                          // entire 127.0.0.0/8 loopback range
      /^0\.0\.0\.0$/,
      /^192\.168\./,
      /^10\./,
      /^172\.(1[6-9]|2[0-9]|3[0-1])\./   // 172.16.0.0/12
    ];
    for (const pattern of privateHostPatterns) {
      if (pattern.test(hostname)) {
        return { valid: false, error: 'URL not allowed for security reasons' };
      }
    }
    // file:// and ftp:// are already rejected by the protocol whitelist above.
    return { valid: true };
  }

  /** Strip script/iframe/event-handler payloads and cap prompt length at 2000 chars. */
  static sanitizeExtractionPrompt(prompt: string): string {
    if (!prompt || typeof prompt !== 'string') {
      return '';
    }
    // Remove potentially dangerous content before the prompt reaches the LLM.
    const cleaned = prompt
      .replace(/<script[^>]*>.*?<\/script>/gi, '')
      .replace(/<iframe[^>]*>.*?<\/iframe>/gi, '')
      .replace(/javascript:/gi, '')
      .replace(/on\w+\s*=/gi, '')
      .trim();
    return cleaned.substring(0, 2000);
  }

  /** Basic shape check for API keys: at least 20 chars of [A-Za-z0-9_-]. */
  static validateApiKey(apiKey: string): boolean {
    if (!apiKey || typeof apiKey !== 'string') {
      return false;
    }
    return /^[A-Za-z0-9_-]{20,}$/.test(apiKey);
  }

  /** Composite identifier (IP + UA prefix + key prefix) for rate limiting. */
  static rateLimitKey(req: any): string {
    const ip = req.ip || req.connection.remoteAddress;
    const userAgent = req.headers['user-agent'] || '';
    const apiKey = req.headers.authorization?.replace('Bearer ', '') || '';
    return `${ip}:${userAgent.substring(0, 50)}:${apiKey.substring(0, 10)}`;
  }
}

Content Security and Filtering

lib/security/content-filter.ts
/**
 * Domain block-listing and sensitive-content redaction for scraped text.
 */
export class ContentFilter {
  private static readonly BLOCKED_DOMAINS = new Set([
    'facebook.com',
    'instagram.com',
    'twitter.com',
    'linkedin.com',
    'tiktok.com'
  ]);

  // BUGFIX: 'g' flag added so filterSensitiveContent redacts EVERY
  // occurrence, not just the first match of each pattern.
  private static readonly SENSITIVE_PATTERNS = [
    /password/gi,
    /credit.?card/gi,
    /ssn|social.?security/gi,
    /api.?key/gi,
    /token/gi,
    /secret/gi
  ];

  /**
   * True when the URL's host is not a blocked domain. Subdomains of a blocked
   * domain (e.g. www.facebook.com) are blocked too — the previous exact-match
   * check was trivially bypassed by any subdomain. Unparseable URLs return false.
   */
  static isAllowedDomain(url: string): boolean {
    try {
      const domain = new URL(url).hostname.toLowerCase();
      for (const blocked of this.BLOCKED_DOMAINS) {
        if (domain === blocked || domain.endsWith(`.${blocked}`)) {
          return false;
        }
      }
      return true;
    } catch {
      return false;
    }
  }

  /** Replace every match of a sensitive keyword with "[REDACTED]". */
  static filterSensitiveContent(content: string): string {
    let filtered = content;
    for (const pattern of this.SENSITIVE_PATTERNS) {
      filtered = filtered.replace(pattern, '[REDACTED]');
    }
    return filtered;
  }

  /**
   * Best-effort detection of personal data in scraped content.
   *
   * @returns Category labels found: 'email_addresses', 'phone_numbers',
   *          'potential_credit_cards'.
   */
  static detectPersonalInfo(content: string): string[] {
    const findings: string[] = [];
    // [A-Za-z] — the previous class [A-Z|a-z] also matched a literal '|'.
    const emailPattern = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/;
    if (emailPattern.test(content)) {
      findings.push('email_addresses');
    }
    const phonePattern = /(\+\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}/;
    if (phonePattern.test(content)) {
      findings.push('phone_numbers');
    }
    // Credit card detection (basic 4x4-digit groups; no Luhn check).
    const ccPattern = /\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b/;
    if (ccPattern.test(content)) {
      findings.push('potential_credit_cards');
    }
    return findings;
  }
}

Monitoring and Observability

Comprehensive Metrics Collection

lib/monitoring/metrics.ts
/** Aggregate view of scraping activity, derived from the raw metrics. */
export interface ScrapingMetrics {
  requestCount: number;
  successCount: number;
  errorCount: number;
  averageResponseTime: number;
  totalCreditsUsed: number;
  activeJobs: number;
  queueSize: number;
}

/**
 * Lightweight in-process metrics store: counters, value statistics
 * (count/sum/min/max/avg/last), and start/stop timers.
 */
export class MetricsCollector {
  private metrics: Map<string, any> = new Map();
  private timers: Map<string, number> = new Map();

  /** Begin timing `key`; pair with endTimer(). */
  startTimer(key: string): void {
    this.timers.set(key, Date.now());
  }

  /**
   * Stop the timer, record the elapsed time under `<key>_duration`, and
   * return it in ms. Returns 0 when no timer was started for `key`.
   */
  endTimer(key: string): number {
    const startedAt = this.timers.get(key);
    if (!startedAt) return 0;
    const elapsed = Date.now() - startedAt;
    this.timers.delete(key);
    this.recordMetric(`${key}_duration`, elapsed);
    return elapsed;
  }

  /** Fold `value` into the running statistics for `key`. */
  recordMetric(key: string, value: number): void {
    const stats = this.metrics.get(key) ?? { count: 0, sum: 0, min: Infinity, max: -Infinity };
    stats.count += 1;
    stats.sum += value;
    if (value < stats.min) stats.min = value;
    if (value > stats.max) stats.max = value;
    stats.avg = stats.sum / stats.count;
    stats.last = value;
    stats.timestamp = Date.now();
    this.metrics.set(key, stats);
  }

  /** Add `value` (default 1) to the counter stored under `key`. */
  incrementCounter(key: string, value: number = 1): void {
    const counter = this.metrics.get(key) ?? { count: 0 };
    counter.count += value;
    counter.timestamp = Date.now();
    this.metrics.set(key, counter);
  }

  /** Raw stats object for one key, or undefined when never recorded. */
  getMetric(key: string): any {
    return this.metrics.get(key);
  }

  /** All metrics as a plain object keyed by metric name. */
  getAllMetrics(): Record<string, any> {
    return Object.fromEntries(this.metrics);
  }

  /** Assemble the well-known scraping metrics, defaulting absent ones to zero. */
  getScrapingMetrics(): ScrapingMetrics {
    const counterOf = (key: string): number => (this.getMetric(key) || { count: 0 }).count;
    return {
      requestCount: counterOf('requests'),
      successCount: counterOf('successes'),
      errorCount: counterOf('errors'),
      averageResponseTime: (this.getMetric('response_time_duration') || { avg: 0 }).avg,
      totalCreditsUsed: (this.getMetric('credits_used') || { sum: 0 }).sum,
      activeJobs: counterOf('active_jobs'),
      queueSize: counterOf('queue_size')
    };
  }

  /** Drop all recorded metrics and in-flight timers. */
  reset(): void {
    this.metrics.clear();
    this.timers.clear();
  }
}

/** Process-wide shared collector. */
export const globalMetrics = new MetricsCollector();

Health Check System

lib/monitoring/health-check.ts
/** Overall system health plus per-service detail. */
export interface HealthStatus {
  status: 'healthy' | 'degraded' | 'unhealthy';
  timestamp: string;
  services: Record<string, ServiceHealth>;
  metrics: {
    uptime: number;
    memoryUsage: number;
    cpuUsage: number;
  };
}

/** Health snapshot for a single dependency. */
export interface ServiceHealth {
  status: 'up' | 'down' | 'degraded';
  responseTime?: number;
  error?: string;
  lastCheck: string;
}

/**
 * Runs registered service probes and aggregates them into one status:
 * any 'down' service makes the system 'unhealthy'; otherwise any
 * 'degraded' service makes it 'degraded'.
 */
export class HealthChecker {
  private services: Map<string, () => Promise<ServiceHealth>> = new Map();

  /** Register (or replace) the probe for a named service. */
  registerService(name: string, checker: () => Promise<ServiceHealth>): void {
    this.services.set(name, checker);
  }

  /** Probe every registered service and compute the aggregate status. */
  async checkHealth(): Promise<HealthStatus> {
    const serviceResults: Record<string, ServiceHealth> = {};
    let overallStatus: 'healthy' | 'degraded' | 'unhealthy' = 'healthy';
    for (const [name, probe] of this.services) {
      let result: ServiceHealth;
      try {
        result = await probe();
      } catch (caught) {
        // A probe that throws counts as a hard outage.
        result = {
          status: 'down',
          error: caught instanceof Error ? caught.message : 'Unknown error',
          lastCheck: new Date().toISOString()
        };
      }
      serviceResults[name] = result;
      if (result.status === 'down') {
        overallStatus = 'unhealthy';
      } else if (result.status === 'degraded' && overallStatus === 'healthy') {
        overallStatus = 'degraded';
      }
    }
    return {
      status: overallStatus,
      timestamp: new Date().toISOString(),
      services: serviceResults,
      metrics: {
        uptime: process.uptime(),
        memoryUsage: process.memoryUsage().heapUsed / 1024 / 1024, // MB
        cpuUsage: process.cpuUsage().user / 1000000 // seconds of user CPU time
      }
    };
  }
}
// Health check implementations
// Shared checker instance; services register their probes against it below.
export const healthChecker = new HealthChecker();
// Register FireCrawl service check
// NOTE(review): `firecrawlApp` is referenced but never imported in this
// snippet — presumably it is the FireCrawl client instance constructed
// elsewhere in the app; confirm the import when wiring this file in.
healthChecker.registerService('firecrawl', async (): Promise<ServiceHealth> => {
const startTime = Date.now();
try {
// Simple ping to FireCrawl API
const result = await firecrawlApp.scrapeUrl('https://httpbin.org/status/200', {
formats: ['markdown'],
timeout: 5000
});
// An API that answers but fails the scrape is 'degraded', not 'down'.
return {
status: result.success ? 'up' : 'degraded',
responseTime: Date.now() - startTime,
lastCheck: new Date().toISOString()
};
} catch (error) {
// Any thrown error (network, auth, timeout) marks the service down.
return {
status: 'down',
error: error instanceof Error ? error.message : 'Unknown error',
responseTime: Date.now() - startTime,
lastCheck: new Date().toISOString()
};
}
});
// Register database check
healthChecker.registerService('database', async (): Promise<ServiceHealth> => {
const startTime = Date.now();
try {
// Simple database query
// await db.query('SELECT 1');
// NOTE(review): the actual query above is commented out, so this probe
// currently always reports 'up' — wire in the real client before relying on it.
return {
status: 'up',
responseTime: Date.now() - startTime,
lastCheck: new Date().toISOString()
};
} catch (error) {
return {
status: 'down',
error: error instanceof Error ? error.message : 'Database connection failed',
responseTime: Date.now() - startTime,
lastCheck: new Date().toISOString()
};
}
});

Production Deployment Checklist

Essential Configuration

config/production.ts
// Central production settings for the scraping service.
// All durations are in milliseconds unless noted otherwise.
export const PRODUCTION_CONFIG = {
// API Configuration
api: {
timeout: 30000,
retryAttempts: 3,
rateLimitPerHour: 1000,
maxConcurrentRequests: 10
},
// Cache Configuration
cache: {
ttl: 3600000, // 1 hour
maxSize: 1000,
evictionPolicy: 'LRU' as const
},
// Security Configuration
security: {
maxUrlLength: 2048,
maxPromptLength: 2000,
allowedDomains: [], // Empty = all allowed
blockedDomains: ['localhost', '127.0.0.1'],
requireHttps: true
},
// Monitoring Configuration
monitoring: {
metricsRetention: 86400000, // 24 hours
healthCheckInterval: 30000, // 30 seconds
alertThresholds: {
errorRate: 0.05, // 5%
responseTime: 10000, // 10 seconds
queueSize: 100
}
}
};

Key Takeaways

  • Implement comprehensive error handling with proper classification and retry strategies
  • Use intelligent caching and request pooling for optimal performance
  • Apply security best practices including input validation and content filtering
  • Monitor your application with detailed metrics and health checks
  • Follow production deployment best practices for reliability and scalability

Series Conclusion

Congratulations! You’ve completed the comprehensive “Web Scraping with FireCrawl and Vercel SDK” series. You now have the knowledge and tools to build production-ready web scraping applications that are:

  • Scalable: Handle high volumes with proper architecture
  • Reliable: Robust error handling and monitoring
  • Secure: Protected against common vulnerabilities
  • Maintainable: Well-structured and documented code
  • Cost-effective: Optimized for performance and resource usage

Continue building amazing scraping solutions and remember to always respect website terms of service and robots.txt files!

Share Feedback