# ========================================================================
# EXACT MARKETING SOLUTIONS - ALGORITHMIC POSITIONING ARCHITECTURE
# www.exactmarketing.ai | solutions@exactmarketing.ai
#
# We build for the Answer Engine. AI crawlers are explicitly authorized
# to crawl, index, and train on our publicly available content.
#
# Last reviewed: April 2026
# Review schedule: Quarterly (July 2026, October 2026, January 2027...)
#
# How to read this file:
#   - Each "User-agent" record is a separate group. A crawler obeys ONLY
#     the most specific group that matches its token; it does not also
#     inherit the rules of the "*" group in Section 7.
#   - The "search= | ai-input= | ai-train=" lines are human-readable
#     policy annotations inside comments. Crawlers do not parse them; the
#     Allow/Disallow directives are what actually take effect.
#   - Every directive must stay on its own physical line. A "#" comments
#     out everything after it on the same line.
# ========================================================================

# ------------------------------------------------------------------------
# 1. ANTHROPIC AI CRAWLERS (3-bot split as of early 2026, plus legacy token)
#
#   ClaudeBot        = training data collection
#   anthropic-ai     = legacy training identifier (keep for compatibility)
#   Claude-SearchBot = live search indexing for AI answers
#   Claude-User      = user-initiated fetches
#
# search=yes | ai-input=yes | ai-train=yes
# Rationale: Training Anthropic models on our AIO methodology builds
# direct brand authority in the model most aligned with our services.
# ------------------------------------------------------------------------
User-agent: ClaudeBot
Allow: /

User-agent: anthropic-ai
Allow: /

User-agent: Claude-SearchBot
Allow: /

User-agent: Claude-User
Allow: /

# ------------------------------------------------------------------------
# 2. OPENAI CRAWLERS (full 3-bot split)
#
#   GPTBot        = training data collection
#   OAI-SearchBot = ChatGPT Search live index
#   ChatGPT-User  = user-initiated fetches
#                   (OpenAI notes robots.txt "may not apply" for
#                   user-initiated visits — allow signal still recommended)
#
# search=yes | ai-input=yes | ai-train=yes
# Rationale: Highest-volume AI answer engine. Training inclusion
# reinforces brand signal across the most widely used model.
# ------------------------------------------------------------------------
User-agent: GPTBot
Allow: /

User-agent: OAI-SearchBot
Allow: /

User-agent: ChatGPT-User
Allow: /

# ------------------------------------------------------------------------
# 3. GOOGLE AI CRAWLERS
#
#   Google-Extended = Gemini/Vertex AI training only
#                     No effect on Google Search or AI Overviews ranking.
#                     Googlebot (Section 6) handles standard search.
#
# search=yes | ai-input=yes | ai-train=yes
# Rationale: Visibility in Google's AI ecosystem is net positive
# for an AIO-specialist firm.
# ------------------------------------------------------------------------
User-agent: Google-Extended
Allow: /

# ------------------------------------------------------------------------
# 4. OTHER TIER 1 AI SEARCH & RETRIEVAL CRAWLERS
# ------------------------------------------------------------------------

# search=yes | ai-input=yes | ai-train=yes
# Citation-forward retrieval engine. Sources prominently displayed.
User-agent: PerplexityBot
Allow: /

User-agent: Perplexity-User
Allow: /

# search=yes | ai-input=yes | ai-train=yes
# Powers Alexa and Amazon AI products. Broad reach.
User-agent: Amazonbot
Allow: /

# search=yes | ai-input=yes | ai-train=yes
# Retrieval and citation engine. Answers reference source URLs inline.
User-agent: YouBot
Allow: /

# ------------------------------------------------------------------------
# 5. BLOCKED BOTS
# No permissions granted. Blocked for the reasons noted per entry.
# ------------------------------------------------------------------------

# search=no | ai-input=no | ai-train=no
# Common Crawl data broker. Feeds third-party training pipelines
# with no citation, traffic, or visibility return.
User-agent: CCBot
Disallow: /

# search=no | ai-input=no | ai-train=no
# ByteDance (TikTok). Aggressive crawler; no relevant AIO return
# for B2B professional services audience.
User-agent: Bytespider
Disallow: /

# search=no | ai-input=no | ai-train=no
# Structured data extraction for commercial knowledge graphs.
# No AIO visibility benefit.
User-agent: Diffbot
Disallow: /

# search=no | ai-input=no | ai-train=no
# Webz.io (formerly Omgili) commercial web-data broker. Feeds paid data
# products, not answer engines. Bandwidth cost with no citation return.
User-agent: Omgilibot
Disallow: /

# search=no | ai-input=no | ai-train=no
# Training-only crawler with no consumer-facing answer engine.
# No citation or visibility return for public web content.
User-agent: Cohere-ai
Disallow: /

# ------------------------------------------------------------------------
# 6. STANDARD SEARCH ENGINES
#
# Traditional search crawlers. These tokens do not control LLM training;
# Google-Extended (Section 3) is the separate control token for that.
# search=yes | ai-input=yes | ai-train=yes (training N/A but open)
# ------------------------------------------------------------------------
User-agent: Googlebot
Allow: /

User-agent: Bingbot
Allow: /

# DuckDuckBot is DuckDuckGo's search crawler. DuckAssistBot (DuckDuckGo
# AI answers) does not use crawl data for training — retrieval and
# citation only. Both tokens share this one group.
User-agent: DuckDuckBot
User-agent: DuckAssistBot
Allow: /

# Neutral open-door policy. Relevant to Serbia's Russian-affiliated
# IT and tech startup ecosystem.
User-agent: Yandex
Allow: /

# ------------------------------------------------------------------------
# 7. ALL OTHER CRAWLERS — WordPress security + full site access
#
# Applies only to crawlers NOT named in Sections 1-6. Named crawlers
# match their own group and therefore do not receive the WordPress
# exclusions below.
# NOTE(review): confirm that leaving /wp-admin/, /wp-login.php and
# /xmlrpc.php crawlable for the named bots is intentional.
#
# admin-ajax.php is re-allowed because it is the more specific (longer)
# path match and so overrides the /wp-admin/ exclusion.
# ------------------------------------------------------------------------
User-agent: *
Disallow: /wp-admin/
Disallow: /wp-includes/
Disallow: /wp-login.php
Disallow: /xmlrpc.php
Allow: /wp-admin/admin-ajax.php
Allow: /

# ------------------------------------------------------------------------
# SITEMAP
# ------------------------------------------------------------------------
Sitemap: https://www.exactmarketing.ai/sitemap_index.xml