Spider Integration Patterns
Overview
Patterns and best practices for integrating spider_rs into the Web Intelligence MCP server.
Core Principle: Async First
spider_rs is an async-native library. All operations should use await:
from spider_rs import crawl
# Always await the crawl function
result = await crawl(url)
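Because await is only valid inside a coroutine, a standalone script needs an async entry point. A minimal sketch, assuming the crawl import shown above and a placeholder URL:

import asyncio
from spider_rs import crawl

async def main() -> None:
    # Placeholder URL for illustration only
    result = await crawl("https://example.com")
    print(f"{len(result.links)} links discovered")

if __name__ == "__main__":
    asyncio.run(main())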
Pattern 1: Single URL Fetch
Simplest pattern for scraping one page:
@mcp.tool()
async def web_scrape(url: str) -> str:
    """Scrape a single URL."""
    import json

    from spider_rs import crawl

    result = await crawl(url)
    if result.pages:
        page = result.pages[0]
        return json.dumps({
            'url': url,
            'content_length': len(page.content) if hasattr(page, 'content') else 0,
            'links_found': len(result.links)
        })
    return json.dumps({'error': 'No content retrieved'})
Pattern 2: Parallel Multi-Site
Crawl multiple sites concurrently; total wall-clock time approaches that of the slowest single crawl rather than the sum of all crawls:
from typing import List

@mcp.tool()
async def web_crawl_batch(urls: List[str]) -> str:
    """Crawl multiple URLs in parallel."""
    import asyncio
    import json

    from spider_rs import crawl

    results = await asyncio.gather(
        *[crawl(url) for url in urls],
        return_exceptions=True
    )
    output = []
    for url, result in zip(urls, results):
        if isinstance(result, Exception):
            output.append({'url': url, 'error': str(result)})
        else:
            output.append({
                'url': url,
                'pages': len(result.pages),
                'links': len(result.links)
            })
    return json.dumps(output)
Pattern 3: Link Discovery
Extract all links from a page:
@mcp.tool()
async def web_discover_links(url: str) -> str:
    """Discover all links on a page."""
    import json

    from spider_rs import crawl

    result = await crawl(url)
    # Filter and categorize links; url.split('/')[2] is a naive host check
    # that assumes an absolute http(s) URL
    internal = [l for l in result.links if url.split('/')[2] in l]
    external = [l for l in result.links if url.split('/')[2] not in l]
    return json.dumps({
        'url': url,
        'total_links': len(result.links),
        'internal': internal[:20],  # Limit for response size
        'external': external[:20]
    })
Pattern 4: Timeout Handling
Prevent long-running crawls from blocking:
@mcp.tool()
async def web_scrape_safe(url: str, timeout: int = 30) -> str:
    """Scrape with timeout protection."""
    import asyncio
    import json

    from spider_rs import crawl

    try:
        result = await asyncio.wait_for(
            crawl(url),
            timeout=timeout
        )
        return json.dumps({'success': True, 'pages': len(result.pages)})
    except asyncio.TimeoutError:
        return json.dumps({'error': f'Timeout after {timeout}s'})
    except Exception as e:
        return json.dumps({'error': str(e)})
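Patterns 2 and 4 combine naturally: giving each crawl in a batch its own timeout keeps one slow site from stalling the whole gather. A sketch under that assumption; crawl_batch_with_timeout is an illustrative name, not an existing tool:

from typing import List

async def crawl_batch_with_timeout(urls: List[str], timeout: int = 30) -> str:
    import asyncio
    import json

    from spider_rs import crawl

    # Each crawl gets its own timeout; failures surface as Exception objects
    results = await asyncio.gather(
        *[asyncio.wait_for(crawl(u), timeout=timeout) for u in urls],
        return_exceptions=True
    )
    return json.dumps([
        {'url': u, 'error': str(r)} if isinstance(r, Exception)
        else {'url': u, 'pages': len(r.pages)}
        for u, r in zip(urls, results)
    ])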
Pattern 5: Rate-Limited Batch
Respect site rate limits with controlled concurrency:
from typing import List

@mcp.tool()
async def web_crawl_polite(
    urls: List[str],
    max_concurrent: int = 3,
    delay_between: float = 0.5
) -> str:
    """Crawl with rate limiting."""
    import asyncio
    import json

    from spider_rs import crawl

    semaphore = asyncio.Semaphore(max_concurrent)

    async def crawl_one(url):
        async with semaphore:
            result = await crawl(url)
            # Hold the semaphore slot briefly to space out requests
            await asyncio.sleep(delay_between)
            return result

    results = await asyncio.gather(
        *[crawl_one(url) for url in urls],
        return_exceptions=True
    )
    # gather preserves input order, so zip results back to their URLs;
    # failed crawls come back as Exception objects, not results
    return json.dumps([{
        'url': url,
        'pages': len(r.pages) if not isinstance(r, Exception) else 0
    } for url, r in zip(urls, results)])
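A single semaphore caps overall concurrency, but when the URL list mixes many hosts it does not guarantee politeness per site. One possible refinement, sketched with a per-host semaphore; the helper names and the per-host limit of 2 are illustrative assumptions:

import asyncio
from collections import defaultdict
from urllib.parse import urlparse

from spider_rs import crawl

# One semaphore per host; the limit of 2 concurrent requests is arbitrary
host_locks = defaultdict(lambda: asyncio.Semaphore(2))

async def crawl_per_host(url: str, delay: float = 0.5):
    async with host_locks[urlparse(url).netloc]:
        result = await crawl(url)
        # Hold the slot during the delay to space requests to this host
        await asyncio.sleep(delay)
        return result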
Error Handling Best Practices
- Always use try/except: network operations can fail
- Set timeouts: prevent infinite hangs
- Use return_exceptions=True: don't let one failure kill the batch
- Log errors: store failures for debugging
import logging

logger = logging.getLogger(__name__)

# Inside an async tool function (crawl and asyncio imported as above):
try:
    result = await asyncio.wait_for(crawl(url), timeout=30)
except asyncio.TimeoutError:
    logger.warning(f'Timeout crawling {url}')
    return {'error': 'timeout'}
except Exception as e:
    logger.error(f'Error crawling {url}: {e}')
    return {'error': str(e)}
Performance Tips
- Use asyncio.gather for parallel operations
- Limit concurrency to avoid overwhelming targets
- Cache results in Redis for repeated lookups (a sketch follows this list)
- Filter early: don't process links you don't need
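A sketch of the Redis caching tip, assuming redis-py's asyncio client is installed and a local Redis instance is available; the key scheme and TTL are illustrative:

import json

import redis.asyncio as redis
from spider_rs import crawl

cache = redis.from_url("redis://localhost:6379/0")

async def crawl_cached(url: str, ttl: int = 3600) -> dict:
    key = f"crawl:{url}"
    hit = await cache.get(key)
    if hit is not None:
        # Serve the cached summary without re-crawling
        return json.loads(hit)
    result = await crawl(url)
    summary = {'url': url, 'pages': len(result.pages), 'links': len(result.links)}
    await cache.set(key, json.dumps(summary), ex=ttl)
    return summary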