Basic Web Scraping with FireCrawl API
Now that you have FireCrawl set up, let’s dive into the core API functionality. In this part, we’ll explore different scraping methods, data extraction techniques, and how to handle various content formats effectively.
Understanding FireCrawl’s API Methods
FireCrawl provides several key methods for different scraping scenarios:
1. Single Page Scraping (scrapeUrl)
Perfect for extracting data from individual pages.
2. Website Crawling (crawlUrl)
Automatically discovers and scrapes multiple pages from a website.
3. Batch Processing
Handle multiple URLs efficiently with proper rate limiting.
Let’s explore each method with practical examples.
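At a glance, the two core calls look like this (a minimal sketch using the firecrawlApp client configured in Part 1; the option names match the fuller examples below). The third scenario, batch processing, is a pattern rather than a separate call, and is covered by the BatchProcessor helper at the end of this part.

import { firecrawlApp } from '../config/firecrawl';

async function quickTour() {
  // 1. Scrape a single page as markdown
  const page = await firecrawlApp.scrapeUrl('https://example.com', {
    formats: ['markdown'],
    onlyMainContent: true,
  });
  if (page.success) {
    console.log(page.data.metadata?.title);
  }

  // 2. Crawl a site, capped at 5 pages
  const crawl = await firecrawlApp.crawlUrl('https://example.com', {
    limit: 5,
    scrapeOptions: { formats: ['markdown'] },
  });
  if (crawl.success) {
    console.log(`Crawled ${crawl.data.length} pages`);
  }
}

quickTour().catch(console.error);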
Single Page Scraping Deep Dive
Basic Scraping with Different Formats
import { firecrawlApp } from '../config/firecrawl';

export class FormatScraper {
  async scrapeWithMarkdown(url: string) {
    const result = await firecrawlApp.scrapeUrl(url, {
      formats: ['markdown'],
      onlyMainContent: true,
    });

    if (result.success) {
      return {
        title: result.data.metadata?.title,
        content: result.data.markdown,
        wordCount: result.data.markdown?.split(' ').length || 0,
      };
    }
    return null;
  }

  async scrapeWithHtml(url: string) {
    const result = await firecrawlApp.scrapeUrl(url, {
      formats: ['html'],
      includeTags: ['h1', 'h2', 'h3', 'p', 'a', 'img', 'div'],
      excludeTags: ['script', 'style', 'nav', 'footer', 'aside'],
    });

    if (result.success) {
      return {
        title: result.data.metadata?.title,
        html: result.data.html,
        links: this.extractLinks(result.data.html || ''),
        images: this.extractImages(result.data.html || ''),
      };
    }
    return null;
  }

  async scrapeWithStructuredData(url: string) {
    const result = await firecrawlApp.scrapeUrl(url, {
      formats: ['markdown', 'html'],
      extractorOptions: {
        mode: 'llm-extraction',
        extractionPrompt: `Extract the following information:
          - Main heading
          - All subheadings
          - Key points or bullet points
          - Any contact information
          - Publication date if available
          Return as structured JSON.`,
      },
    });

    return result.success ? result.data : null;
  }

  private extractLinks(html: string): string[] {
    const linkRegex = /<a[^>]+href=["']([^"']+)["'][^>]*>/gi;
    const links: string[] = [];
    let match;

    while ((match = linkRegex.exec(html)) !== null) {
      links.push(match[1]);
    }

    return [...new Set(links)]; // Remove duplicates
  }

  private extractImages(html: string): string[] {
    const imgRegex = /<img[^>]+src=["']([^"']+)["'][^>]*>/gi;
    const images: string[] = [];
    let match;

    while ((match = imgRegex.exec(html)) !== null) {
      images.push(match[1]);
    }

    return [...new Set(images)];
  }
}
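A quick way to exercise the class (a sketch; it assumes the file is saved as ../scrapers/format-scraper, the path the test suite later in this part imports from):

import { FormatScraper } from '../scrapers/format-scraper';

async function demo() {
  const scraper = new FormatScraper();

  const asMarkdown = await scraper.scrapeWithMarkdown('https://example.com');
  if (asMarkdown) {
    console.log(`"${asMarkdown.title}" (${asMarkdown.wordCount} words)`);
  }

  const asHtml = await scraper.scrapeWithHtml('https://example.com');
  if (asHtml) {
    console.log(`Found ${asHtml.links.length} links and ${asHtml.images.length} images`);
  }
}

demo().catch(console.error);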
Advanced Data Extraction Techniques
Custom Data Extractors
import { firecrawlApp } from '../config/firecrawl';

export interface ProductData {
  name: string;
  price: string;
  description: string;
  images: string[];
  availability: string;
  rating?: number;
  reviews?: number;
}

export interface ArticleData {
  title: string;
  author: string;
  publishDate: string;
  content: string;
  tags: string[];
  readingTime: number;
}

export class DataExtractor {
  async extractProductInfo(url: string): Promise<ProductData | null> {
    const result = await firecrawlApp.scrapeUrl(url, {
      formats: ['markdown'],
      extractorOptions: {
        mode: 'llm-extraction',
        extractionPrompt: `Extract product information from this e-commerce page:
          - Product name
          - Price (include currency)
          - Product description
          - Image URLs
          - Availability status
          - Rating (if available)
          - Number of reviews (if available)

          Return as JSON with keys: name, price, description, images, availability, rating, reviews`,
      },
    });

    if (!result.success) return null;

    try {
      // Parse the extracted data
      const extractedData = result.data.extract;
      return this.validateProductData(extractedData);
    } catch (error) {
      console.error('Error parsing product data:', error);
      return null;
    }
  }

  async extractArticleInfo(url: string): Promise<ArticleData | null> {
    const result = await firecrawlApp.scrapeUrl(url, {
      formats: ['markdown'],
      extractorOptions: {
        mode: 'llm-extraction',
        extractionPrompt: `Extract article information:
          - Title
          - Author name
          - Publication date
          - Full article content
          - Tags or categories
          - Estimated reading time in minutes

          Return as JSON with keys: title, author, publishDate, content, tags, readingTime`,
      },
    });

    if (!result.success) return null;

    try {
      const extractedData = result.data.extract;
      return this.validateArticleData(extractedData);
    } catch (error) {
      console.error('Error parsing article data:', error);
      return null;
    }
  }

  private validateProductData(data: any): ProductData | null {
    if (!data || typeof data !== 'object') return null;

    return {
      name: data.name || 'Unknown Product',
      price: data.price || 'Price not available',
      description: data.description || '',
      images: Array.isArray(data.images) ? data.images : [],
      availability: data.availability || 'Unknown',
      rating: typeof data.rating === 'number' ? data.rating : undefined,
      reviews: typeof data.reviews === 'number' ? data.reviews : undefined,
    };
  }

  private validateArticleData(data: any): ArticleData | null {
    if (!data || typeof data !== 'object') return null;

    return {
      title: data.title || 'Untitled',
      author: data.author || 'Unknown Author',
      publishDate: data.publishDate || '',
      content: data.content || '',
      tags: Array.isArray(data.tags) ? data.tags : [],
      readingTime: typeof data.readingTime === 'number' ? data.readingTime : 0,
    };
  }
}
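A short usage sketch (the URLs are placeholders; the import path matches the one used by the aggregator and test suite later in this part):

import { DataExtractor } from '../scrapers/data-extractor';

async function demo() {
  const extractor = new DataExtractor();

  // Placeholder URL: point this at a real product page
  const product = await extractor.extractProductInfo('https://shop.example.com/products/sample');
  if (product) {
    console.log(`${product.name}: ${product.price} (${product.availability})`);
  }

  // Placeholder URL: point this at a real article
  const article = await extractor.extractArticleInfo('https://blog.example.com/sample-article');
  if (article) {
    console.log(`${article.title} by ${article.author} (~${article.readingTime} min read)`);
  }
}

demo().catch(console.error);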
Website Crawling Fundamentals
Basic Website Crawling
import { firecrawlApp } from '../config/firecrawl';
import { saveToFile } from '../utils/helpers';

export interface CrawlResult {
  url: string;
  title: string;
  content: string;
  links: string[];
  crawledAt: string;
  depth: number;
}

export class WebsiteCrawler {
  async crawlWebsite(
    startUrl: string,
    options: {
      maxPages?: number;
      allowSubdomains?: boolean;
      excludePatterns?: string[];
      includePatterns?: string[];
    } = {}
  ): Promise<CrawlResult[]> {
    const {
      maxPages = 10,
      allowSubdomains = false,
      excludePatterns = [],
      includePatterns = [],
    } = options;

    console.log(`🕷️ Starting crawl of: ${startUrl}`);

    const crawlResult = await firecrawlApp.crawlUrl(startUrl, {
      limit: maxPages,
      scrapeOptions: {
        formats: ['markdown'],
        onlyMainContent: true,
      },
      allowBackwardCrawling: false,
      allowExternalContentLinks: allowSubdomains,
      excludePaths: excludePatterns,
      includePaths: includePatterns,
    });

    if (!crawlResult.success) {
      console.error('❌ Crawl failed:', crawlResult.error);
      return [];
    }

    const results: CrawlResult[] = [];

    for (const page of crawlResult.data) {
      if (page.markdown) {
        results.push({
          url: page.metadata?.sourceURL || '',
          title: page.metadata?.title || 'No title',
          content: page.markdown,
          links: this.extractLinksFromMarkdown(page.markdown),
          crawledAt: new Date().toISOString(),
          depth: this.calculateDepth(startUrl, page.metadata?.sourceURL || ''),
        });
      }
    }

    console.log(`✅ Crawled ${results.length} pages successfully`);
    return results;
  }

  async crawlBlogSitemap(baseUrl: string): Promise<CrawlResult[]> {
    // Common blog patterns
    const includePatterns = [
      '/blog/*',
      '/posts/*',
      '/articles/*',
      '/news/*',
    ];

    const excludePatterns = [
      '/admin/*',
      '/wp-admin/*',
      '/login',
      '/register',
      '*.pdf',
      '*.jpg',
      '*.png',
    ];

    return this.crawlWebsite(baseUrl, {
      maxPages: 50,
      includePatterns,
      excludePatterns,
    });
  }

  async crawlEcommerceCatalog(baseUrl: string): Promise<CrawlResult[]> {
    const includePatterns = [
      '/products/*',
      '/catalog/*',
      '/shop/*',
      '/category/*',
    ];

    const excludePatterns = [
      '/cart',
      '/checkout',
      '/account',
      '/admin',
      '*.pdf',
    ];

    return this.crawlWebsite(baseUrl, {
      maxPages: 100,
      includePatterns,
      excludePatterns,
    });
  }

  private extractLinksFromMarkdown(markdown: string): string[] {
    const linkRegex = /\[([^\]]+)\]\(([^)]+)\)/g;
    const links: string[] = [];
    let match;

    while ((match = linkRegex.exec(markdown)) !== null) {
      links.push(match[2]);
    }

    return [...new Set(links)];
  }

  private calculateDepth(baseUrl: string, currentUrl: string): number {
    try {
      const base = new URL(baseUrl);
      const current = new URL(currentUrl);

      if (base.hostname !== current.hostname) return 0;

      const basePath = base.pathname.split('/').filter(Boolean);
      const currentPath = current.pathname.split('/').filter(Boolean);

      return Math.max(0, currentPath.length - basePath.length);
    } catch {
      return 0;
    }
  }
}
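To run a crawl and persist the results with the saveToFile helper imported above (a sketch; ../scrapers/website-crawler is an assumed path, since the post doesn't name this file, and the URL is a placeholder):

import { WebsiteCrawler } from '../scrapers/website-crawler'; // assumed path
import { saveToFile } from '../utils/helpers';

async function demo() {
  const crawler = new WebsiteCrawler();

  // Crawl up to 50 blog-style pages and save them as JSON
  const pages = await crawler.crawlBlogSitemap('https://example.com');
  await saveToFile(pages, `blog-crawl-${Date.now()}`, 'json');

  console.log(`Saved ${pages.length} pages`);
}

demo().catch(console.error);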
Handling Different Content Types
News Article Scraper
import { firecrawlApp } from '../config/firecrawl';

export interface NewsArticle {
  headline: string;
  summary: string;
  author: string;
  publishDate: string;
  category: string;
  content: string;
  url: string;
  imageUrl?: string;
  tags: string[];
}

export class NewsScraper {
  async scrapeNewsArticle(url: string): Promise<NewsArticle | null> {
    const result = await firecrawlApp.scrapeUrl(url, {
      formats: ['markdown'],
      extractorOptions: {
        mode: 'llm-extraction',
        extractionPrompt: `Extract news article information:
          - Headline/title
          - Brief summary (1-2 sentences)
          - Author name
          - Publication date
          - Category/section
          - Full article content
          - Main image URL if available
          - Tags or keywords

          Return as JSON with keys: headline, summary, author, publishDate, category, content, imageUrl, tags`,
      },
    });

    if (!result.success) return null;

    const extractedData = result.data.extract;

    return {
      headline: extractedData?.headline || 'No headline',
      summary: extractedData?.summary || '',
      author: extractedData?.author || 'Unknown',
      publishDate: extractedData?.publishDate || '',
      category: extractedData?.category || 'General',
      content: extractedData?.content || result.data.markdown || '',
      url,
      imageUrl: extractedData?.imageUrl,
      tags: Array.isArray(extractedData?.tags) ? extractedData.tags : [],
    };
  }

  async scrapeNewsHomepage(url: string): Promise<NewsArticle[]> {
    const result = await firecrawlApp.scrapeUrl(url, {
      formats: ['markdown'],
      extractorOptions: {
        mode: 'llm-extraction',
        extractionPrompt: `Extract all news articles from this homepage:
          For each article, extract:
          - Headline
          - Brief summary
          - Author (if available)
          - Publication date
          - Category
          - Article URL
          - Image URL (if available)

          Return as JSON array of articles`,
      },
    });

    if (!result.success || !result.data.extract) return [];

    const articles = Array.isArray(result.data.extract)
      ? result.data.extract
      : [result.data.extract];

    return articles.map((article: any) => ({
      headline: article.headline || 'No headline',
      summary: article.summary || '',
      author: article.author || 'Unknown',
      publishDate: article.publishDate || '',
      category: article.category || 'General',
      content: article.summary || '', // Homepage usually has summaries
      url: article.url || url,
      imageUrl: article.imageUrl,
      tags: Array.isArray(article.tags) ? article.tags : [],
    }));
  }
}
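Calling the scraper against a homepage looks like this (a sketch; the import path matches the one the content aggregator below uses):

import { NewsScraper } from '../scrapers/news-scraper';

async function demo() {
  const scraper = new NewsScraper();

  const articles = await scraper.scrapeNewsHomepage('https://techcrunch.com');
  for (const article of articles.slice(0, 5)) {
    console.log(`[${article.category}] ${article.headline}`);
  }
}

demo().catch(console.error);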
Social Media Content Scraper

import { firecrawlApp } from '../config/firecrawl';

export interface SocialPost {
  platform: string;
  author: string;
  content: string;
  timestamp: string;
  likes?: number;
  shares?: number;
  comments?: number;
  hashtags: string[];
  mentions: string[];
  url: string;
}

export class SocialScraper {
  async scrapeTwitterProfile(profileUrl: string): Promise<SocialPost[]> {
    const result = await firecrawlApp.scrapeUrl(profileUrl, {
      formats: ['markdown'],
      extractorOptions: {
        mode: 'llm-extraction',
        extractionPrompt: `Extract recent tweets from this Twitter profile:
          For each tweet, extract:
          - Tweet content/text
          - Author username
          - Timestamp
          - Number of likes, retweets, replies (if visible)
          - Hashtags used
          - Mentioned users

          Return as JSON array of tweets`,
      },
    });

    if (!result.success || !result.data.extract) return [];

    const tweets = Array.isArray(result.data.extract)
      ? result.data.extract
      : [result.data.extract];

    return tweets.map((tweet: any) => ({
      platform: 'Twitter',
      author: tweet.author || 'Unknown',
      content: tweet.content || '',
      timestamp: tweet.timestamp || '',
      likes: tweet.likes,
      shares: tweet.retweets,
      comments: tweet.replies,
      hashtags: this.extractHashtags(tweet.content || ''),
      mentions: this.extractMentions(tweet.content || ''),
      url: profileUrl,
    }));
  }

  async scrapeLinkedInPost(postUrl: string): Promise<SocialPost | null> {
    const result = await firecrawlApp.scrapeUrl(postUrl, {
      formats: ['markdown'],
      extractorOptions: {
        mode: 'llm-extraction',
        extractionPrompt: `Extract LinkedIn post information:
          - Post content/text
          - Author name and title
          - Publication timestamp
          - Number of likes, comments, shares (if visible)
          - Any hashtags used

          Return as JSON`,
      },
    });

    if (!result.success || !result.data.extract) return null;

    const post = result.data.extract;

    return {
      platform: 'LinkedIn',
      author: post.author || 'Unknown',
      content: post.content || '',
      timestamp: post.timestamp || '',
      likes: post.likes,
      shares: post.shares,
      comments: post.comments,
      hashtags: this.extractHashtags(post.content || ''),
      mentions: this.extractMentions(post.content || ''),
      url: postUrl,
    };
  }

  private extractHashtags(text: string): string[] {
    const hashtagRegex = /#[\w]+/g;
    return text.match(hashtagRegex) || [];
  }

  private extractMentions(text: string): string[] {
    const mentionRegex = /@[\w]+/g;
    return text.match(mentionRegex) || [];
  }
}
Practical Examples and Use Cases
Building a Content Aggregator
import { NewsScraper } from '../scrapers/news-scraper';
import { DataExtractor } from '../scrapers/data-extractor';
import { saveToFile } from '../utils/helpers';

export class ContentAggregator {
  private newsScraper = new NewsScraper();
  private dataExtractor = new DataExtractor();

  async aggregateNewsFromSources(sources: string[]): Promise<void> {
    const allArticles = [];

    for (const source of sources) {
      console.log(`📰 Scraping news from: ${source}`);

      try {
        const articles = await this.newsScraper.scrapeNewsHomepage(source);
        allArticles.push(...articles.map(article => ({
          ...article,
          source: new URL(source).hostname,
        })));

        // Respect rate limits
        await new Promise(resolve => setTimeout(resolve, 2000));
      } catch (error) {
        console.error(`❌ Failed to scrape ${source}:`, error);
      }
    }

    // Sort by publication date (newest first)
    allArticles.sort((a, b) =>
      new Date(b.publishDate).getTime() - new Date(a.publishDate).getTime()
    );

    // Save aggregated content
    await saveToFile(allArticles, `news-aggregation-${Date.now()}`, 'json');

    console.log(`✅ Aggregated ${allArticles.length} articles from ${sources.length} sources`);
  }

  async createDailyDigest(): Promise<void> {
    const sources = [
      'https://techcrunch.com',
      'https://www.theverge.com',
      'https://arstechnica.com',
    ];

    await this.aggregateNewsFromSources(sources);
  }
}
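Running the digest is then a one-liner, for example from a scheduled job (a sketch; the import path is an assumption):

import { ContentAggregator } from '../aggregators/content-aggregator'; // assumed path

// Kick off the daily digest, e.g. from a cron job or CI schedule
new ContentAggregator().createDailyDigest().catch(console.error);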
E-commerce Price Monitor

import { DataExtractor } from '../scrapers/data-extractor';
import { saveToFile } from '../utils/helpers';

export interface PriceHistory {
  productUrl: string;
  productName: string;
  currentPrice: string;
  previousPrice?: string;
  priceChange?: number;
  timestamp: string;
  availability: string;
}

export class PriceMonitor {
  private dataExtractor = new DataExtractor();
  private priceHistory: Map<string, PriceHistory[]> = new Map();

  async monitorProducts(productUrls: string[]): Promise<PriceHistory[]> {
    const currentPrices: PriceHistory[] = [];

    for (const url of productUrls) {
      console.log(`💰 Checking price for: ${url}`);

      try {
        const productData = await this.dataExtractor.extractProductInfo(url);

        if (productData) {
          const priceHistory: PriceHistory = {
            productUrl: url,
            productName: productData.name,
            currentPrice: productData.price,
            timestamp: new Date().toISOString(),
            availability: productData.availability,
          };

          // Check for price changes
          const history = this.priceHistory.get(url) || [];
          if (history.length > 0) {
            const lastPrice = history[history.length - 1];
            priceHistory.previousPrice = lastPrice.currentPrice;
            priceHistory.priceChange = this.calculatePriceChange(
              lastPrice.currentPrice,
              productData.price
            );
          }

          // Update history
          history.push(priceHistory);
          this.priceHistory.set(url, history);
          currentPrices.push(priceHistory);
        }

        // Rate limiting
        await new Promise(resolve => setTimeout(resolve, 3000));
      } catch (error) {
        console.error(`❌ Failed to check price for ${url}:`, error);
      }
    }

    // Save current price check
    await saveToFile(currentPrices, `price-check-${Date.now()}`, 'json');

    return currentPrices;
  }

  private calculatePriceChange(oldPrice: string, newPrice: string): number {
    const oldValue = this.extractNumericPrice(oldPrice);
    const newValue = this.extractNumericPrice(newPrice);

    if (oldValue && newValue) {
      return ((newValue - oldValue) / oldValue) * 100;
    }

    return 0;
  }

  private extractNumericPrice(priceString: string): number | null {
    const match = priceString.match(/[\d,]+\.?\d*/);
    // Strip every thousands separator (not just the first) before parsing
    return match ? parseFloat(match[0].replace(/,/g, '')) : null;
  }

  getPriceAlerts(threshold: number = 10): PriceHistory[] {
    const alerts: PriceHistory[] = [];

    for (const history of this.priceHistory.values()) {
      const latest = history[history.length - 1];
      if (latest.priceChange && Math.abs(latest.priceChange) >= threshold) {
        alerts.push(latest);
      }
    }

    return alerts;
  }
}
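A usage sketch (the import path and product URLs are placeholders; note that priceHistory lives in memory, so previousPrice and priceChange only appear from the second check within the same process):

import { PriceMonitor } from '../monitors/price-monitor'; // assumed path

async function demo() {
  const monitor = new PriceMonitor();

  // Placeholder URLs: substitute real product pages
  const urls = [
    'https://shop.example.com/products/widget',
    'https://shop.example.com/products/gadget',
  ];

  await monitor.monitorProducts(urls); // first check seeds the history
  await monitor.monitorProducts(urls); // second check can detect changes

  const alerts = monitor.getPriceAlerts(5); // alert on price moves of 5% or more
  console.log(`${alerts.length} price alert(s)`);
}

demo().catch(console.error);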
Error Handling and Debugging
Robust Error Handling
export class ScrapingError extends Error {
  constructor(
    message: string,
    public url: string,
    public statusCode?: number,
    public retryable: boolean = true
  ) {
    super(message);
    this.name = 'ScrapingError';
  }
}

export class RateLimitError extends ScrapingError {
  retryAfter?: number;

  constructor(url: string, retryAfter?: number) {
    super(`Rate limit exceeded for ${url}`, url, 429, true);
    this.name = 'RateLimitError';
    this.retryAfter = retryAfter;
  }
}

export async function withRetry<T>(
  operation: () => Promise<T>,
  maxRetries: number = 3,
  baseDelay: number = 1000
): Promise<T> {
  let lastError: Error;

  for (let attempt = 1; attempt <= maxRetries; attempt++) {
    try {
      return await operation();
    } catch (error) {
      lastError = error as Error;

      if (error instanceof ScrapingError && !error.retryable) {
        throw error;
      }

      if (attempt === maxRetries) {
        break;
      }

      const delay = baseDelay * Math.pow(2, attempt - 1); // Exponential backoff
      console.log(`⏳ Attempt ${attempt} failed, retrying in ${delay}ms...`);
      await new Promise(resolve => setTimeout(resolve, delay));
    }
  }

  throw lastError!;
}
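Here is how withRetry can wrap a FireCrawl call so transient failures back off and retry (a sketch; '../utils/errors' is an assumed location for the error helpers above, and the URL is a placeholder):

import { firecrawlApp } from '../config/firecrawl';
import { withRetry, ScrapingError } from '../utils/errors'; // assumed path

async function scrapeWithRetries(url: string) {
  return withRetry(async () => {
    const result = await firecrawlApp.scrapeUrl(url, { formats: ['markdown'] });

    if (!result.success) {
      // Surface failed scrapes as retryable ScrapingErrors
      throw new ScrapingError(result.error || 'Scrape failed', url);
    }

    return result.data;
  }, 3, 2000);
}

scrapeWithRetries('https://example.com')
  .then(data => console.log(data.metadata?.title))
  .catch(error => console.error('Giving up after retries:', error));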
Testing Your Scrapers
Create a comprehensive test suite:
import { FormatScraper } from '../scrapers/format-scraper';
import { DataExtractor } from '../scrapers/data-extractor';
import { NewsScraper } from '../scrapers/news-scraper';

export class ScraperTests {
  private formatScraper = new FormatScraper();
  private dataExtractor = new DataExtractor();
  private newsScraper = new NewsScraper();

  async runAllTests(): Promise<void> {
    console.log('🧪 Running scraper tests...\n');

    await this.testBasicScraping();
    await this.testDataExtraction();
    await this.testNewsArticle();

    console.log('\n✅ All tests completed!');
  }

  private async testBasicScraping(): Promise<void> {
    console.log('📄 Testing basic scraping...');

    const testUrl = 'https://example.com';
    const result = await this.formatScraper.scrapeWithMarkdown(testUrl);

    if (result) {
      console.log(`✅ Successfully scraped: ${result.title}`);
      console.log(`📊 Word count: ${result.wordCount}`);
    } else {
      console.log('❌ Basic scraping test failed');
    }
  }

  private async testDataExtraction(): Promise<void> {
    console.log('🔍 Testing data extraction...');

    // Test with a sample article URL
    const testUrl = 'https://blog.example.com/sample-article';
    const result = await this.dataExtractor.extractArticleInfo(testUrl);

    if (result) {
      console.log(`✅ Extracted article: ${result.title}`);
      console.log(`👤 Author: ${result.author}`);
      console.log(`📅 Published: ${result.publishDate}`);
    } else {
      console.log('❌ Data extraction test failed');
    }
  }

  private async testNewsArticle(): Promise<void> {
    console.log('📰 Testing news scraping...');

    const testUrl = 'https://news.example.com/latest';
    const result = await this.newsScraper.scrapeNewsArticle(testUrl);

    if (result) {
      console.log(`✅ Scraped news: ${result.headline}`);
      console.log(`🏷️ Category: ${result.category}`);
      console.log(`🏷️ Tags: ${result.tags.join(', ')}`);
    } else {
      console.log('❌ News scraping test failed');
    }
  }
}
Performance Optimization Tips
Batch Processing
export class BatchProcessor {
  async processBatch<T, R>(
    items: T[],
    processor: (item: T) => Promise<R>,
    batchSize: number = 5,
    delayBetweenBatches: number = 2000
  ): Promise<R[]> {
    const results: R[] = [];

    for (let i = 0; i < items.length; i += batchSize) {
      const batch = items.slice(i, i + batchSize);
      console.log(`🔄 Processing batch ${Math.floor(i / batchSize) + 1}/${Math.ceil(items.length / batchSize)}`);

      const batchPromises = batch.map(processor);
      const batchResults = await Promise.allSettled(batchPromises);

      batchResults.forEach((result, index) => {
        if (result.status === 'fulfilled') {
          results.push(result.value);
        } else {
          console.error(`❌ Failed to process item ${i + index}:`, result.reason);
        }
      });

      // Delay between batches
      if (i + batchSize < items.length) {
        await new Promise(resolve => setTimeout(resolve, delayBetweenBatches));
      }
    }

    return results;
  }
}
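Combining the batch processor with one of the earlier scrapers looks like this (a sketch; '../utils/batch-processor' is an assumed path and the URLs are placeholders):

import { BatchProcessor } from '../utils/batch-processor'; // assumed path
import { FormatScraper } from '../scrapers/format-scraper';

async function demo() {
  const processor = new BatchProcessor();
  const scraper = new FormatScraper();

  // Placeholder URLs: substitute the pages you want to scrape
  const urls = [
    'https://example.com/page-1',
    'https://example.com/page-2',
    'https://example.com/page-3',
  ];

  // 5 URLs per batch with a 2-second pause between batches (the defaults)
  const pages = await processor.processBatch(urls, url => scraper.scrapeWithMarkdown(url));

  console.log(`Scraped ${pages.length} of ${urls.length} pages`);
}

demo().catch(console.error);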
Key Takeaways
- FireCrawl’s API supports multiple output formats (markdown, HTML, structured data)
- Use LLM extraction for complex data parsing and structured output
- Implement proper error handling and retry logic for production use
- Respect rate limits and add delays between requests
- Test your scrapers with various content types and edge cases
- Use batch processing for handling multiple URLs efficiently
Next Steps
In Part 3, we’ll explore advanced scraping techniques including:
- Handling dynamic content and JavaScript-heavy sites
- Working with pagination and infinite scroll
- Advanced data processing and transformation
- Building custom extraction schemas
- Performance optimization strategies
You now have a solid foundation for basic web scraping with FireCrawl. The techniques covered here will serve as building blocks for more complex scraping scenarios!