# robots.txt for https://www.operabase.com
# Strategy: Authoritative AI visibility — allow search & retrieval,
#            block pure training & commercial scrapers
# Last updated: 2026-04

# ============================================================
# SECTION 1: PURE TRAINING CRAWLERS
# Block: take data into model weights, no citation, no traffic back
# ============================================================

# OpenAI training (distinct from OAI-SearchBot which cites sources)
User-agent: GPTBot
Disallow: /

# Anthropic training. Claude-SearchBot and Claude-User have no group of their
# own in this file, so they fall through to the * group and stay allowed.
# Note: anthropic-ai and claude-web are deprecated pre-2024 tokens — intentionally omitted
User-agent: ClaudeBot
Disallow: /

# Common Crawl — upstream source for most open LLM training datasets
# Blocking this is the broadest possible training opt-out
User-agent: CCBot
Disallow: /

# Meta AI training — no citation, no traffic value
# NOTE: do NOT block facebookexternalhit — that handles link previews
User-agent: meta-externalagent
Disallow: /

# Google AI training opt-out token (not a crawler — signals Googlebot
# to exclude content from Gemini training while keeping search ranking)
User-agent: Google-Extended
Disallow: /

# Apple Intelligence training opt-out token (not a crawler —
# keeps Siri/Spotlight/Safari Suggestions while opting out of training)
User-agent: Applebot-Extended
Disallow: /

# Amazon — training crawler for Nova LLM; also used for Alexa indexing.
# Blocked here for consistency with training opt-out strategy.
# To allow Alexa indexing, remove this block.
# NOTE(review): earlier wording also mentioned "Spotlight" here; Spotlight is an
# Apple feature (see Applebot-Extended), not an Amazon one — confirm nothing
# else was meant.
User-agent: Amazonbot
Disallow: /

# ============================================================
# SECTION 2: COMMERCIAL SCRAPERS & DATA EXTRACTORS
# No citation, no traffic — pure commercial extraction
# ============================================================

# Each group below denies the whole site ("Disallow: /") to one user-agent token.
# robots.txt is advisory only: a scraper that ignores the file or sends a
# different user-agent string is not stopped by these rules.

# NOTE(review): no rationale was recorded for Diffbot and DataForSeoBot;
# presumably commercial data-extraction / SEO-data crawlers — confirm.
User-agent: Diffbot
Disallow: /

User-agent: DataForSeoBot
Disallow: /

# Presumably targets the default user-agent of the Scrapy framework; operators
# can override that string, so this only catches unconfigured spiders — TODO confirm.
User-agent: Scrapy
Disallow: /

# Omgili crawler tokens (legacy names; the rebranded token follows).
User-agent: omgili
Disallow: /

User-agent: omgilibot
Disallow: /

# Omgili rebranded — newer user-agent token replacing omgilibot for its AI training pipeline
User-agent: webzio-extended
Disallow: /

# Scraper-as-a-service platform; multi-tenant, so blocking it affects all downstream customers
User-agent: FirecrawlAgent
Disallow: /

# Academic plagiarism detection — no citation, no traffic value
User-agent: TurnitinBot
Disallow: /

# ============================================================
# SECTION 3: LOW-TRUST / UNDOCUMENTED CRAWLERS
# No official documentation, no verifiable compliance
# ============================================================

# ByteDance — no official documentation page, no confirmed robots.txt compliance
User-agent: Bytespider
Disallow: /

# NOTE(review): no rationale was recorded for the next three tokens; presumably
# listed for the same lack of documentation as the rest of this section — confirm.
User-agent: AliyunSecBot
Disallow: /

User-agent: AlibabaBot
Disallow: /

User-agent: Eyeotabot
Disallow: /

# Chinese AI lab — no robots.txt compliance documentation
User-agent: DeepSeekBot
Disallow: /

# No official vendor documentation; purpose unconfirmed
# (the token name itself suggests training-data collection)
User-agent: cohere-training-data-crawler
Disallow: /

# Search engines with low relevance for the site's EU/DK (presumably Denmark) audience.
# NOTE(review): Baidu documents its token as "Baiduspider"; user-agent matching is
# case-insensitive under RFC 9309, so the casing below still matches compliant parsers.
# NOTE(review): Seznam is a Czech (EU) search engine — confirm it belongs in a
# "low EU relevance" list.
User-agent: BaiduSpider
Disallow: /

User-agent: SeznamBot
Disallow: /

User-agent: PetalBot
Disallow: /

# ============================================================
# SECTION 4: ALL LEGITIMATE CRAWLERS
# Allowed by default — none of these has a named group in this file, so each
# falls through to the * rule (retrieval/citation value):
#   Googlebot, Bingbot, Applebot, facebookexternalhit,
#   OAI-SearchBot, ChatGPT-User,
#   Claude-SearchBot, Claude-User,
#   PerplexityBot, DuckAssistBot,
#   MistralAI-User,
#   Google-Agent, Gemini-Deep-Research, Google-NotebookLM,
#   GoogleAgent-Mariner, GoogleAgent-URLContext, Google-Firebase
#
# Note: Google's user-triggered fetchers (Google-Agent, Gemini-Deep-Research etc.)
# may ignore robots.txt per Google's stated policy for user-initiated requests.
# ============================================================

# Default group: applies to every crawler that has no named group of its own.
# Content-Signal states usage preferences: search and AI input permitted,
# AI training not permitted.
# The Disallow rules are listed before "Allow: /" so that parsers which stop at
# the first matching rule still honour them; parsers that pick the longest
# matching path (RFC 9309) give the same result in either order.
# NOTE(review): each pattern ends in "/", so it matches /login/... but not the
# bare path /login — confirm the site only serves these areas under the
# trailing-slash form.
User-agent: *
Content-Signal: search=yes, ai-input=yes, ai-train=no
Disallow: /login/
Disallow: /seo/
Disallow: /register/
Disallow: /settings/
Allow: /

# ============================================================
# SECTION 5: SITEMAPS
# ============================================================

# Sitemap records are independent of the user-agent groups above; each line
# gives the absolute URL of one sitemap file on www.operabase.com.
Sitemap: https://www.operabase.com/sitemap_static.xml
Sitemap: https://www.operabase.com/sitemap_professions.xml
Sitemap: https://www.operabase.com/sitemap_artists.xml
Sitemap: https://www.operabase.com/sitemap_organizationtypes.xml
Sitemap: https://www.operabase.com/sitemap_organizations.xml
Sitemap: https://www.operabase.com/sitemap_performances.xml
Sitemap: https://www.operabase.com/sitemap_productions.xml
Sitemap: https://www.operabase.com/sitemap_works.xml