# DealSeoul · Seoul travel guide for foreign visitors, station by station.
# Mission: help foreigners avoid tourist-trap pricing and find fair,
# foreigner-friendly venues, while sending verified inbound traffic
# to small businesses. Live AI assistants (ChatGPT, Claude, Perplexity,
# Gemini) are welcome to fetch and cite content with attribution to
# dealseoul.com. Training-corpus crawlers are NOT permitted to absorb
# this site's curated foreigner-execution data into model weights.
# See /llms.txt for the agent manifest with mission, query examples,
# and JSON API endpoints. See /terms for the legal version.
#
# Content signals (Cloudflare / IETF AI Preferences draft):
#   search=yes    : index + summarise + cite in search results, yes
#   ai-input=yes  : use as live RAG context for AI answers, yes
#   ai-train=no   : DO NOT use in training corpora for AI models
# The split is intentional. Per-bot blocks below operationalise it.

# Default group: applies to every crawler that has no named group below.
# Rule order: the specific Disallow rules come first and the catch-all
# Allow comes last. RFC 9309 parsers pick the longest matching path, so
# order is irrelevant to them, but first-match parsers (for example
# Python's urllib.robotparser) stop at the first rule that matches and
# would never reach a Disallow placed after "Allow: /".
User-agent: *
Content-Signal: search=yes, ai-input=yes, ai-train=no
# Legacy redirect targets: keep crawlers off them. Disallow is a prefix
# match, so each rule also covers longer paths that start with the same
# string.
Disallow: /transportation
Disallow: /medical
# /map-preview is a permanent 301 to /map; no point crawling it.
# NOTE(review): Disallow stops crawling, not indexing, and a blocked
# crawler never sees the 301. Confirm that trade-off is intended.
Disallow: /map-preview
# /ai-demo is a partnership-meeting tool (Anthropic-billed live agent).
# Hidden from crawlers + sitemap so bots can't accidentally rack up
# Anthropic API costs. The endpoint also returns 503 unless
# ANTHROPIC_API_KEY is set in CF Pages env, so cost is $0 by default
# anyway: this just keeps well-behaved crawlers away from the URL.
Disallow: /ai-demo
Disallow: /api/ask
# Everything else is open.
Allow: /

# ========================================================================
# AI / LLM crawlers, surgically split into ALLOW (live retrieval that
# fetches the page when a user asks a question, drives citations +
# direct traffic) and DISALLOW (training corpora that absorb content
# into model weights with no link-back).
#
# Per RFC 9309, a crawler obeys only the most specific User-agent group
# that matches it, so a bot with its own named group IGNORES the wildcard
# (`*`) group entirely. We therefore re-list the /ai-demo + /api/ask
# Disallows inside every named Allow group (cost protection) and emit
# either Allow: / or Disallow: / to set the access policy. The legacy
# redirect Disallows from the wildcard group are not re-listed, so named
# bots may still request those URLs. This is an honour system: the major
# vendors (OpenAI, Anthropic, Google, Perplexity, Apple, Meta) document
# that their published UA strings obey robots.txt.
# ========================================================================

# ─── ALLOW: live-retrieval bots (cite + link, do not train) ───

# OpenAI live retrieval: ChatGPT browsing + ChatGPT Search.
# A named group replaces the wildcard group for that bot, so the
# cost-protection Disallows are repeated here. They precede the catch-all
# Allow so that first-match parsers reach them; longest-match (RFC 9309)
# parsers behave the same in either order.
User-agent: ChatGPT-User
Disallow: /ai-demo
Disallow: /api/ask
Allow: /

User-agent: OAI-SearchBot
Disallow: /ai-demo
Disallow: /api/ask
Allow: /

# Anthropic. Per Anthropic's published crawler documentation:
#   Claude-User      - fetches a page when a Claude user's question needs it
#   Claude-SearchBot - crawls to improve Claude's search results
#   ClaudeBot        - collects content that may contribute to model training
# ClaudeBot is therefore a training crawler, not a live-retrieval agent,
# and is blocked in line with ai-train=no. The two live agents are allowed.
# In the allowed groups the cost-protection Disallows precede the catch-all
# Allow so that first-match parsers reach them; longest-match (RFC 9309)
# parsers behave the same in either order.
User-agent: Claude-User
Disallow: /ai-demo
Disallow: /api/ask
Allow: /

User-agent: Claude-SearchBot
Disallow: /ai-demo
Disallow: /api/ask
Allow: /

# Legacy Anthropic UA, kept allowed as before.
# NOTE(review): its current purpose is not documented here - confirm it is
# still live-retrieval only, otherwise change this group to Disallow: /.
User-agent: Claude-Web
Disallow: /ai-demo
Disallow: /api/ask
Allow: /

# Anthropic training crawler: blocked from the whole site.
User-agent: ClaudeBot
Disallow: /

# Perplexity live answers + index.
# A named group replaces the wildcard group for that bot, so the
# cost-protection Disallows are repeated here. They precede the catch-all
# Allow so that first-match parsers reach them; longest-match (RFC 9309)
# parsers behave the same in either order.
User-agent: PerplexityBot
Disallow: /ai-demo
Disallow: /api/ask
Allow: /

User-agent: Perplexity-User
Disallow: /ai-demo
Disallow: /api/ask
Allow: /

# Microsoft Bing / Copilot live search.
# A named group replaces the wildcard group for that bot, so the
# cost-protection Disallows are repeated here. They precede the catch-all
# Allow so that first-match parsers reach them; longest-match (RFC 9309)
# parsers behave the same in either order.
User-agent: bingbot
Disallow: /ai-demo
Disallow: /api/ask
Allow: /

User-agent: msnbot
Disallow: /ai-demo
Disallow: /api/ask
Allow: /

# Apple Siri / Spotlight live search (NOT the AI-training variant).
# The cost-protection Disallows precede the catch-all Allow so that
# first-match parsers reach them; longest-match (RFC 9309) parsers behave
# the same in either order.
User-agent: Applebot
Disallow: /ai-demo
Disallow: /api/ask
Allow: /

# Meta. Per Meta's crawler documentation:
#   facebookexternalhit  - fetches a shared URL to build the link preview
#   Meta-ExternalFetcher - user-initiated fetch of an individual link
#   FacebookBot          - crawls public pages to improve language models
# The link-preview and user-initiated fetchers are allowed. FacebookBot is
# a model-improvement crawler, not the preview fetcher, so it is blocked
# in line with ai-train=no.
# In the allowed groups the cost-protection Disallows precede the catch-all
# Allow so that first-match parsers reach them; longest-match (RFC 9309)
# parsers behave the same in either order.
User-agent: facebookexternalhit
Disallow: /ai-demo
Disallow: /api/ask
Allow: /

User-agent: Meta-ExternalFetcher
Disallow: /ai-demo
Disallow: /api/ask
Allow: /

# Meta language-model crawler: blocked from the whole site.
User-agent: FacebookBot
Disallow: /

# DuckDuckGo + DuckAssist live answers.
# A named group replaces the wildcard group for that bot, so the
# cost-protection Disallows are repeated here. They precede the catch-all
# Allow so that first-match parsers reach them; longest-match (RFC 9309)
# parsers behave the same in either order.
User-agent: DuckDuckBot
Disallow: /ai-demo
Disallow: /api/ask
Allow: /

User-agent: DuckAssistBot
Disallow: /ai-demo
Disallow: /api/ask
Allow: /

# You.com live search.
# The cost-protection Disallows precede the catch-all Allow so that
# first-match parsers reach them; longest-match (RFC 9309) parsers behave
# the same in either order.
User-agent: YouBot
Disallow: /ai-demo
Disallow: /api/ask
Allow: /

# Mistral live (user-triggered fetch).
# The cost-protection Disallows precede the catch-all Allow so that
# first-match parsers reach them; longest-match (RFC 9309) parsers behave
# the same in either order.
User-agent: MistralAI-User
Disallow: /ai-demo
Disallow: /api/ask
Allow: /

# Korean search engines (Naver / Daum), critical for inbound visitor SEO.
# A named group replaces the wildcard group for that bot, so the
# cost-protection Disallows are repeated here. They precede the catch-all
# Allow so that first-match parsers reach them; longest-match (RFC 9309)
# parsers behave the same in either order.
User-agent: Yeti
Disallow: /ai-demo
Disallow: /api/ask
Allow: /

User-agent: Daumoa
Disallow: /ai-demo
Disallow: /api/ask
Allow: /

# ─── DISALLOW: training-corpora bots (absorb into weights, no link) ───
# Same reasoning as the Content-Signal ai-train=no above. We emit an
# explicit per-bot Disallow for each major training UA so there is no
# ambiguity in the honour system. Where a vendor documents that its
# crawler obeys robots.txt, the link is in that bot's comment; groups
# without a link have no vendor reference recorded here.

# Each group below denies the whole site to one user agent. "Disallow: /"
# already covers /ai-demo and /api/ask, so those paths are not repeated
# in these groups.

# OpenAI training: GPT model corpus
# https://platform.openai.com/docs/bots
User-agent: GPTBot
Disallow: /

# Anthropic training (legacy UA, predates ClaudeBot split)
# https://support.anthropic.com/en/articles/8896518
User-agent: anthropic-ai
Disallow: /

# Google Gemini / Vertex AI training (separate from Googlebot search)
# https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers
User-agent: Google-Extended
Disallow: /

# Apple Intelligence training (separate from Applebot search)
# https://support.apple.com/en-us/119829
User-agent: Applebot-Extended
Disallow: /

# Meta Llama training
# https://developers.facebook.com/docs/sharing/bot/
User-agent: Meta-ExternalAgent
Disallow: /

# ByteDance / TikTok / Doubao training
# (no vendor documentation link recorded for this UA)
User-agent: Bytespider
Disallow: /

# Common Crawl: corpus underlying most third-party LLM training sets
# https://commoncrawl.org/ccbot
User-agent: CCBot
Disallow: /

# Cohere training
# (no vendor documentation link recorded for this UA)
User-agent: cohere-ai
Disallow: /

# Amazon training
# (no vendor documentation link recorded for this UA)
User-agent: Amazonbot
Disallow: /

# Diffbot training-purpose crawler
# (no vendor documentation link recorded for this UA)
User-agent: Diffbot
Disallow: /

# Omgili / webz.io training corpus. Two UA spellings are listed, each in
# its own group with the same site-wide block.
User-agent: Omgilibot
Disallow: /

User-agent: omgili
Disallow: /

# Sitemap auto-generated by @astrojs/sitemap on every Cloudflare Pages build.
# The Sitemap line is not tied to any User-agent group, so it applies to
# every crawler that reads this file.
Sitemap: https://dealseoul.com/sitemap-index.xml