# ========================================================================
# EXACT MARKETING SOLUTIONS - ALGORITHMIC POSITIONING ARCHITECTURE
# www.exactmarketing.ai | solutions@exactmarketing.ai
#
# We build for the Answer Engine. AI crawlers are explicitly authorized
# to crawl, index, and train on our publicly available content, except
# for the agents listed under BLOCKED BOTS (Section 5).
#
# Last reviewed: April 2026
# Review schedule: Quarterly (July 2026, October 2026, January 2027...)
# ========================================================================


# ------------------------------------------------------------------------
# 1. ANTHROPIC AI CRAWLERS (3-bot split plus legacy token, early 2026)
#
#    ClaudeBot        = training data collection
#    anthropic-ai     = legacy training identifier (keep for compatibility)
#    Claude-SearchBot = live search indexing for AI answers
#    Claude-User      = user-initiated fetches
#
#    search=yes | ai-input=yes | ai-train=yes
#    Rationale: Training Anthropic models on our AIO methodology builds
#    direct brand authority in the model most aligned with our services.
# ------------------------------------------------------------------------
# One shared group for all Anthropic agents. A crawler that matches a named
# group obeys only that group and ignores "User-agent: *" (RFC 9309), so the
# WordPress admin/login exclusions must be stated here to apply to these bots.
# /wp-includes/ is deliberately left crawlable in this group so the core
# scripts and styles that pages load stay fetchable.
# The admin-ajax.php exception comes first so first-match parsers honor it too.
User-agent: ClaudeBot
User-agent: anthropic-ai
User-agent: Claude-SearchBot
User-agent: Claude-User
Allow: /wp-admin/admin-ajax.php
Disallow: /wp-admin/
Disallow: /wp-login.php
Disallow: /xmlrpc.php
Allow: /


# ------------------------------------------------------------------------
# 2. OPENAI CRAWLERS (full 3-bot split)
#
#    GPTBot        = training data collection
#    OAI-SearchBot = ChatGPT Search live index
#    ChatGPT-User  = user-initiated fetches
#                   (OpenAI notes robots.txt "may not apply" for
#                   user-initiated visits — allow signal still recommended)
#
#    search=yes | ai-input=yes | ai-train=yes
#    Rationale: Highest-volume AI answer engine. Training inclusion
#    reinforces brand signal across the most widely used model.
# ------------------------------------------------------------------------
# One shared group for all OpenAI agents. A crawler that matches a named
# group obeys only that group and ignores "User-agent: *" (RFC 9309), so the
# WordPress admin/login exclusions must be stated here to apply to these bots.
# /wp-includes/ is deliberately left crawlable in this group so the core
# scripts and styles that pages load stay fetchable.
# The admin-ajax.php exception comes first so first-match parsers honor it too.
User-agent: GPTBot
User-agent: OAI-SearchBot
User-agent: ChatGPT-User
Allow: /wp-admin/admin-ajax.php
Disallow: /wp-admin/
Disallow: /wp-login.php
Disallow: /xmlrpc.php
Allow: /


# ------------------------------------------------------------------------
# 3. GOOGLE AI CRAWLERS
#
#    Google-Extended = control token for Gemini/Vertex AI training and
#                      grounding; it is not a separate crawler.
#                      No effect on Google Search or AI Overviews ranking.
#                      Googlebot (Section 6) handles standard search.
#
#    search=yes | ai-input=yes | ai-train=yes
#    Rationale: Visibility in Google's AI ecosystem is net positive
#    for an AIO-specialist firm.
# ------------------------------------------------------------------------
# A named group is evaluated on its own and never inherits from
# "User-agent: *" (RFC 9309), so the WordPress admin/login exclusions are
# repeated here to keep those paths out of scope for this token as well.
# The admin-ajax.php exception comes first so first-match parsers honor it too.
User-agent: Google-Extended
Allow: /wp-admin/admin-ajax.php
Disallow: /wp-admin/
Disallow: /wp-login.php
Disallow: /xmlrpc.php
Allow: /


# ------------------------------------------------------------------------
# 4. OTHER TIER 1 AI SEARCH & RETRIEVAL CRAWLERS
# ------------------------------------------------------------------------

# search=yes | ai-input=yes | ai-train=yes  (every agent in this group)
#
#    PerplexityBot / Perplexity-User
#        Citation-forward retrieval engine. Sources prominently displayed.
#    Amazonbot
#        Powers Alexa and Amazon AI products. Broad reach.
#    YouBot
#        Retrieval and citation engine. Answers reference source URLs inline.
#
# One shared group: a crawler that matches a named group obeys only that
# group and ignores "User-agent: *" (RFC 9309), so the WordPress admin/login
# exclusions must be stated here to apply to these bots.
# /wp-includes/ is deliberately left crawlable in this group so the core
# scripts and styles that pages load stay fetchable.
# The admin-ajax.php exception comes first so first-match parsers honor it too.
User-agent: PerplexityBot
User-agent: Perplexity-User
User-agent: Amazonbot
User-agent: YouBot
Allow: /wp-admin/admin-ajax.php
Disallow: /wp-admin/
Disallow: /wp-login.php
Disallow: /xmlrpc.php
Allow: /


# ------------------------------------------------------------------------
# 5. BLOCKED BOTS
#    No permissions granted. Blocked for the reasons noted per entry.
# ------------------------------------------------------------------------

# Each blocked agent keeps its own group on purpose: if a parser failed to
# match a merged group, the agent would fall back to "User-agent: *", which
# allows crawling.

# search=no | ai-input=no | ai-train=no
# Common Crawl data broker. Feeds third-party training pipelines
# with no citation, traffic, or visibility return.
User-agent: CCBot
Disallow: /

# search=no | ai-input=no | ai-train=no
# ByteDance (TikTok). Aggressive crawler; no relevant AIO return
# for B2B professional services audience.
User-agent: Bytespider
Disallow: /

# search=no | ai-input=no | ai-train=no
# Structured data extraction for commercial knowledge graphs.
# No AIO visibility benefit.
User-agent: Diffbot
Disallow: /

# search=no | ai-input=no | ai-train=no
# Webz.io (formerly Omgili) commercial data broker. Feeds paid data
# products, not answer engines. Bandwidth cost with no citation return.
User-agent: Omgilibot
Disallow: /

# search=no | ai-input=no | ai-train=no
# Cohere agent. Treated here as training-side with no consumer-facing
# answer engine, so no citation or visibility return for public web content.
# NOTE(review): confirm this token's current purpose with Cohere's docs.
User-agent: Cohere-ai
Disallow: /


# ------------------------------------------------------------------------
# 6. STANDARD SEARCH ENGINES
#
#    Traditional search crawlers. Allowing them here is not an AI-training
#    signal — Google-Extended (Section 3) is the separate control token
#    for that.
#    search=yes | ai-input=yes | ai-train=yes (training N/A but open)
# ------------------------------------------------------------------------
# One shared group for the standard search crawlers. A crawler that matches a
# named group obeys only that group and ignores "User-agent: *" (RFC 9309), so
# the WordPress admin/login exclusions must be stated here to apply to them.
# /wp-includes/ is deliberately left crawlable in this group: it holds core
# scripts and styles, and blocking them can keep search crawlers from
# rendering pages.
# The admin-ajax.php exception comes first so first-match parsers honor it too.
#
#    DuckDuckBot / DuckAssistBot
#        DuckDuckBot is the search crawler. DuckAssistBot (DuckDuckGo AI
#        answers) does not use crawl data for training — retrieval and
#        citation only. It is listed explicitly so it gets the same signal.
#    Yandex
#        Neutral open-door policy. Relevant to Serbia's Russian-affiliated
#        IT and tech startup ecosystem.
User-agent: Googlebot
User-agent: Bingbot
User-agent: DuckDuckBot
User-agent: DuckAssistBot
User-agent: Yandex
Allow: /wp-admin/admin-ajax.php
Disallow: /wp-admin/
Disallow: /wp-login.php
Disallow: /xmlrpc.php
Allow: /


# ------------------------------------------------------------------------
# 7. ALL OTHER CRAWLERS — WordPress security + full site access
# ------------------------------------------------------------------------
# Fallback group: it is used only by crawlers that match no named group
# elsewhere in this file, because a crawler obeys the single most specific
# matching group (RFC 9309) and ignores the rest.
# robots.txt is advisory. These rules keep compliant crawlers off WordPress
# admin endpoints but are not access control.
# The admin-ajax.php exception is listed before the /wp-admin/ exclusion so
# that parsers applying rules in first-match order honor it, as longest-match
# parsers already do.
# NOTE(review): /wp-includes/ holds core scripts and styles; confirm that
# blocking it is still intended.
User-agent: *
Allow: /wp-admin/admin-ajax.php
Disallow: /wp-admin/
Disallow: /wp-includes/
Disallow: /wp-login.php
Disallow: /xmlrpc.php
Allow: /


# ------------------------------------------------------------------------
# SITEMAP
#    File-level directive: it is not part of any User-agent group and is
#    visible to every crawler that reads this file.
# ------------------------------------------------------------------------
Sitemap: https://www.exactmarketing.ai/sitemap_index.xml