# building_body_final_nov2025.py
import asyncio
import os
import csv
import re
import hashlib
import traceback

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
from crawl4ai.deep_crawling.filters import FilterChain, DomainFilter, URLPatternFilter

# ========================= CONFIG =========================
TARGET_URL = "https://building-body.com/"
CRAWL_DEPTH = 50
OUTPUT_DIR = "crawled_content"
PROCESSED_CSV = "processed_log.csv"
PERSISTENT_PROFILE = True
USER_DATA_DIR = "./browser_profile"

# ========================= COOKIES (update only if expired) =========================
COOKIES = [
    {"name": "PHPSESSID", "value": "Value", "domain": ".building-body.com", "path": "/"},
    {"name": "oaid", "value": "Value", "domain": ".building-body.com", "path": "/"},
    {
        "name": "wfwaf-authcookie-b8f3ba21d06bf35f497293c6abf7f126",
        "value": "238151%7Cadministrator%7Cmanage_options%2Cunfiltered_html%2Cedit_others_posts%2Cupload_files%2Cpublish_posts%2Cedit_posts%2Cread%7C15c9c3df7c953a9430de001024d2749aca010b766064c5736f621cf4113c29ba",
        "domain": ".building-body.com",
        "path": "/",
    },
]


def safe_filename(url: str) -> str:
    """Build a filesystem-safe, collision-resistant .md filename for a URL."""
    clean = re.sub(r'^https?://', '', url.split('?')[0].rstrip('/'))
    safe = re.sub(r'[^a-zA-Z0-9]', '_', clean)[:100]
    hash_part = hashlib.sha1(url.encode()).hexdigest()[:8]
    return f"{safe}_{hash_part}.md"


async def deep_crawl_and_save():
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    if PERSISTENT_PROFILE:
        os.makedirs(USER_DATA_DIR, exist_ok=True)

    # ======================= BROWSER CONFIG - MAX STEALTH =======================
    browser_config = BrowserConfig(
        headless=True,
        cookies=COOKIES,
        use_persistent_context=PERSISTENT_PROFILE,
        user_data_dir=USER_DATA_DIR if PERSISTENT_PROFILE else None,
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
        viewport_width=1920,
        viewport_height=1080,
        enable_stealth=True,
        extra_args=["--disable-blink-features=AutomationControlled"],
        verbose=False,
    )

    # ======================= CRAWLER CONFIG - SAFE & SLOW =======================
    run_config = CrawlerRunConfig(
        css_selector="h1, .author, .blog-body, title",
        semaphore_count=1,            # one tab only
        mean_delay=8.0,               # 8–12 s random delay between requests
        max_range=4.0,
        delay_before_return_html=3.0,
        page_timeout=90000,
        magic=True,
        simulate_user=True,
        deep_crawl_strategy=BFSDeepCrawlStrategy(
            max_depth=CRAWL_DEPTH,
            include_external=False,
            filter_chain=FilterChain([
                DomainFilter(allowed_domains=["building-body.com", "www.building-body.com"]),
                URLPatternFilter(
                    patterns=["*login*", "*register*", "*wp-admin*", "*wp-includes*",
                              "*.jpg", "*.png", "*.gif", "*.pdf", "*logout*"],
                    reverse=True,     # reject URLs matching these patterns
                ),
            ]),
        ),
        cache_mode=CacheMode.BYPASS,
        stream=True,
        verbose=True,
    )

    print("Starting SAFE deep crawl — this will take hours but WILL finish.")

    async with AsyncWebCrawler(config=browser_config) as crawler:
        with open(PROCESSED_CSV, "w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow(["URL", "Status", "File", "Title"])

            try:
                # In Crawl4AI ≥ 0.16, stream=True makes arun() return an async generator.
                stream = await crawler.arun(url=TARGET_URL, config=run_config)
                async for result in stream:
                    if result.success:
                        fname = safe_filename(result.url)
                        path = os.path.join(OUTPUT_DIR, fname)

                        # Use result.metadata.get("title") instead of result.title
                        title = result.metadata.get("title", "No Title") if result.metadata else "No Title"
                        title = re.sub(r'[\r\n\t]+', ' ', title).strip()

                        with open(path, "w", encoding="utf-8") as f_md:
                            f_md.write(result.markdown or "# No content extracted")

                        print(f"[SUCCESS] {result.url} → {title[:60]}...")
                        writer.writerow([result.url, "Success", fname, title])
                    else:
                        print(f"[FAILED] {result.url} | {(result.error_message or '')[:120]}")
                        writer.writerow([result.url, "Failed", "", ""])
            except KeyboardInterrupt:
                print("\nStopped by user (Ctrl+C)")
            except Exception as e:
                print(f"\nUnexpected error: {e}")
                traceback.print_exc()

    print(f"\nCrawl completed! → {OUTPUT_DIR} | Log: {PROCESSED_CSV}")


if __name__ == "__main__":
    asyncio.run(deep_crawl_and_save())