# building_body_final_nov2025.py
import asyncio
import os
import csv
import re
import hashlib
import traceback

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
from crawl4ai.deep_crawling.filters import FilterChain, DomainFilter, URLPatternFilter

# ========================= CONFIG =========================
TARGET_URL = "https://building-body.com/"
CRAWL_DEPTH = 50
OUTPUT_DIR = "crawled_content"
PROCESSED_CSV = "processed_log.csv"
PERSISTENT_PROFILE = True
USER_DATA_DIR = "./browser_profile"

# ========================= COOKIES (update only if expired) =========================
COOKIES = [
    {"name": "PHPSESSID", "value": "Value", "domain": ".building-body.com", "path": "/"},
    {"name": "oaid", "value": "Value", "domain": ".building-body.com", "path": "/"},
    {
        "name": "wfwaf-authcookie-b8f3ba21d06bf35f497293c6abf7f126",
        "value": "238151%7Cadministrator%7Cmanage_options%2Cunfiltered_html%2Cedit_others_posts%2Cupload_files%2Cpublish_posts%2Cedit_posts%2Cread%7C15c9c3df7c953a9430de001024d2749aca010b766064c5736f621cf4113c29ba",
        "domain": ".building-body.com",
        "path": "/",
    },
]


def safe_filename(url: str) -> str:
    """Build a filesystem-safe, collision-resistant .md filename for a URL."""
    clean = re.sub(r'^https?://', '', url.split('?')[0].rstrip('/'))
    safe = re.sub(r'[^a-zA-Z0-9]', '_', clean)[:100]
    hash_part = hashlib.sha1(url.encode()).hexdigest()[:8]
    return f"{safe}_{hash_part}.md"


async def deep_crawl_and_save():
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    if PERSISTENT_PROFILE:
        os.makedirs(USER_DATA_DIR, exist_ok=True)

    # ======================= BROWSER CONFIG - MAX STEALTH =======================
    browser_config = BrowserConfig(
        headless=True,
        cookies=COOKIES,
        use_persistent_context=PERSISTENT_PROFILE,
        user_data_dir=USER_DATA_DIR if PERSISTENT_PROFILE else None,
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
        viewport_width=1920,
        viewport_height=1080,
        enable_stealth=True,
        extra_args=["--disable-blink-features=AutomationControlled"],
        verbose=False,
    )

    # ======================= CRAWLER CONFIG - SAFE & SLOW =======================
    run_config = CrawlerRunConfig(
        css_selector="h1, .author, .blog-body, title",
        semaphore_count=1,            # one tab only
        mean_delay=8.0,               # 8–12 s random delay between requests
        max_range=4.0,
        delay_before_return_html=3.0,
        page_timeout=90000,
        magic=True,
        simulate_user=True,
        deep_crawl_strategy=BFSDeepCrawlStrategy(
            max_depth=CRAWL_DEPTH,
            include_external=False,
            filter_chain=FilterChain([
                DomainFilter(allowed_domains=["building-body.com", "www.building-body.com"]),
                URLPatternFilter(
                    patterns=["*login*", "*register*", "*wp-admin*", "*wp-includes*",
                              "*.jpg", "*.png", "*.gif", "*.pdf", "*logout*"],
                    reverse=True,     # reject URLs matching these patterns
                ),
            ]),
        ),
        cache_mode=CacheMode.BYPASS,
        stream=True,
        verbose=True,
    )

    print("Starting SAFE deep crawl — this will take hours but WILL finish.")

    async with AsyncWebCrawler(config=browser_config) as crawler:
        with open(PROCESSED_CSV, "w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow(["URL", "Status", "File", "Title"])

            try:
                # In Crawl4AI ≥ 0.16, stream=True makes arun() return an async generator.
                stream = await crawler.arun(url=TARGET_URL, config=run_config)
                async for result in stream:
                    if result.success:
                        fname = safe_filename(result.url)
                        path = os.path.join(OUTPUT_DIR, fname)

                        # Use result.metadata.get("title") instead of result.title
                        title = result.metadata.get("title", "No Title") if result.metadata else "No Title"
                        title = re.sub(r'[\r\n\t]+', ' ', title).strip()

                        with open(path, "w", encoding="utf-8") as f_md:
                            f_md.write(result.markdown or "# No content extracted")

                        print(f"[SUCCESS] {result.url} → {title[:60]}...")
                        writer.writerow([result.url, "Success", fname, title])
                    else:
                        print(f"[FAILED] {result.url} | {(result.error_message or '')[:120]}")
                        writer.writerow([result.url, "Failed", "", ""])
            except KeyboardInterrupt:
                print("\nStopped by user (Ctrl+C)")
            except Exception as e:
                print(f"\nUnexpected error: {e}")
                traceback.print_exc()

    print(f"\nCrawl completed! → {OUTPUT_DIR} | Log: {PROCESSED_CSV}")


if __name__ == "__main__":
    asyncio.run(deep_crawl_and_save())