
Spider Integration Patterns


Overview

Patterns and best practices for integrating spider_rs into the Web Intelligence MCP server.

Core Principle: Async First

spider_rs is an async-native library. All operations should use await:

from spider_rs import crawl

# Always await the crawl function; this must run inside an async
# function, such as an MCP tool handler
result = await crawl(url)

Pattern 1: Single URL Fetch

Simplest pattern for scraping one page:

@mcp.tool()
async def web_scrape(url: str) -> str:
    """Scrape a single URL."""
    from spider_rs import crawl

    result = await crawl(url)

    if result.pages:
        page = result.pages[0]
        return json.dumps({
            'url': url,
            'content_length': len(page.content) if hasattr(page, 'content') else 0,
            'links_found': len(result.links)
        })
    return json.dumps({'error': 'No content retrieved'})

Pattern 2: Parallel Multi-Site

Crawl multiple sites concurrently so total wall-clock time approaches that of the slowest site rather than the sum of all requests:

@mcp.tool()
async def web_crawl_batch(urls: List[str]) -> str:
    """Crawl multiple URLs in parallel."""
    from spider_rs import crawl
    import asyncio

    results = await asyncio.gather(
        *[crawl(url) for url in urls],
        return_exceptions=True
    )

    output = []
    for url, result in zip(urls, results):
        if isinstance(result, Exception):
            output.append({'url': url, 'error': str(result)})
        else:
            output.append({
                'url': url,
                'pages': len(result.pages),
                'links': len(result.links)
            })

    return json.dumps(output)

Pattern 3: Link Discovery

Extract all links from a page:

@mcp.tool()
async def web_discover_links(url: str) -> str:
    """Discover all links on a page."""
    from spider_rs import crawl
    from urllib.parse import urlparse

    result = await crawl(url)

    # Categorize links by comparing against the source host
    # (urlparse is more robust than splitting the URL string by hand)
    host = urlparse(url).netloc
    internal = [link for link in result.links if host in link]
    external = [link for link in result.links if host not in link]

    return json.dumps({
        'url': url,
        'total_links': len(result.links),
        'internal': internal[:20],  # Limit for response size
        'external': external[:20]
    })

Pattern 4: Timeout Handling

Prevent long-running crawls from blocking:

@mcp.tool()
async def web_scrape_safe(url: str, timeout: int = 30) -> str:
    """Scrape with timeout protection."""
    from spider_rs import crawl
    import asyncio

    try:
        result = await asyncio.wait_for(
            crawl(url),
            timeout=timeout
        )
        return json.dumps({'success': True, 'pages': len(result.pages)})
    except asyncio.TimeoutError:
        return json.dumps({'error': f'Timeout after {timeout}s'})
    except Exception as e:
        return json.dumps({'error': str(e)})

Pattern 5: Rate-Limited Batch

Respect site rate limits with controlled concurrency:

@mcp.tool()
async def web_crawl_polite(
    urls: List[str],
    max_concurrent: int = 3,
    delay_between: float = 0.5
) -> str:
    """Crawl with rate limiting."""
    from spider_rs import crawl
    import asyncio

    semaphore = asyncio.Semaphore(max_concurrent)

    async def crawl_one(url):
        async with semaphore:
            result = await crawl(url)
            await asyncio.sleep(delay_between)  # politeness delay per slot
            return result

    results = await asyncio.gather(
        *[crawl_one(url) for url in urls],
        return_exceptions=True
    )

    # Pair results with the input list: gather preserves order, and a
    # failed task appears as an Exception, not an unpackable tuple
    return json.dumps([{
        'url': url,
        'pages': len(r.pages) if not isinstance(r, Exception) else 0
    } for url, r in zip(urls, results)])

Error Handling Best Practices

  1. Always use try/except - Network operations can fail
  2. Set timeouts - Prevent infinite hangs
  3. Use return_exceptions=True - Don't let one failure kill the batch
  4. Log errors - Store failures for debugging

For example (inside an async tool, with logger defined via logging.getLogger at module level):
try:
    result = await asyncio.wait_for(crawl(url), timeout=30)
except asyncio.TimeoutError:
    logger.warning(f'Timeout crawling {url}')
    return {'error': 'timeout'}
except Exception as e:
    logger.error(f'Error crawling {url}: {e}')
    return {'error': str(e)}

Performance Tips

  1. Use asyncio.gather for parallel operations
  2. Limit concurrency to avoid overwhelming targets
  3. Cache results in Redis for repeated lookups (see the sketch after this list)
  4. Filter early - Don't process links you don't need
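
A minimal sketch for tip 3, assuming a locally reachable Redis and the redis-py asyncio client (redis.asyncio); the tool name, key prefix, and TTL default are illustrative choices, not part of spider_rs or the existing server:

@mcp.tool()
async def web_scrape_cached(url: str, ttl: int = 3600) -> str:
    """Scrape a URL, reusing a cached result when available."""
    import redis.asyncio as redis
    from spider_rs import crawl

    # Assumed local Redis; in the real server, create the client once at
    # startup and reuse it instead of connecting per call
    cache = redis.from_url('redis://localhost:6379')
    cache_key = f'webintel:scrape:{url}'  # hypothetical key scheme

    cached = await cache.get(cache_key)
    if cached:
        return cached.decode()

    result = await crawl(url)
    payload = json.dumps({
        'url': url,
        'pages': len(result.pages),
        'links': len(result.links)
    })

    # Store with a TTL so stale pages eventually refresh
    await cache.set(cache_key, payload, ex=ttl)
    return payload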