# robots.txt for https://www.intrasec.ca
#
# Policy: welcome all well-behaved crawlers, including AI / LLM crawlers, to
# index and cite the site. The explicit AI user-agent groups below are
# documentation of intent: access is already granted by the wildcard rule, but
# naming them makes the policy unambiguous and gives one place to adjust AI
# access in future. See also /llms.txt (curated map for AI assistants).
#
# Content pages that should not be indexed use an on-page
# <meta name="robots" content="noindex"> tag, never a Disallow here: a Disallow
# blocks crawling and would stop Google from ever seeing the noindex directive.
# The ONE exception is /api/ (Disallow below): those are POST-only JSON endpoints,
# not crawlable pages, carry no noindex tag, and should not be crawled at all, so
# Disallow is the correct tool and it scopes to that path only (no page's noindex
# is affected). robots.txt groups do NOT inherit each other, so every named group
# below repeats the same Disallow: /api/.

User-agent: *
# API endpoints are POST-only JSON (e.g. /api/contact, /api/scan/*); a crawler GET
# returns 405. Not pages, never meant to be indexed.
# Disallow is listed before Allow so that parsers applying the first matching
# rule (instead of RFC 9309 longest-match) still keep /api/ blocked.
Disallow: /api/
Allow: /

# ── AI / LLM crawlers ──
# OpenAI (ChatGPT): training crawl, live search fetch, user-triggered fetch
# Disallow precedes Allow so first-match parsers also keep /api/ blocked.
User-agent: GPTBot
Disallow: /api/
Allow: /

User-agent: OAI-SearchBot
Disallow: /api/
Allow: /

User-agent: ChatGPT-User
Disallow: /api/
Allow: /

# Anthropic (Claude): training crawl, search indexing, user-triggered fetch
# Disallow precedes Allow so first-match parsers also keep /api/ blocked.
User-agent: ClaudeBot
Disallow: /api/
Allow: /

User-agent: Claude-SearchBot
Disallow: /api/
Allow: /

User-agent: Claude-User
Disallow: /api/
Allow: /

# Older Anthropic tokens, kept in case an agent still announces them.
User-agent: anthropic-ai
Disallow: /api/
Allow: /

User-agent: Claude-Web
Disallow: /api/
Allow: /

# Perplexity: index crawl + user-triggered fetch
# Disallow precedes Allow so first-match parsers also keep /api/ blocked.
User-agent: PerplexityBot
Disallow: /api/
Allow: /

User-agent: Perplexity-User
Disallow: /api/
Allow: /

# Google Gemini apps / Vertex AI: content-use control token, not a crawler.
# It does not affect Google Search or AI Overviews; those follow Googlebot,
# which is covered by the wildcard group.
# Disallow precedes Allow so first-match parsers also keep /api/ blocked.
User-agent: Google-Extended
Disallow: /api/
Allow: /

# Apple Intelligence: content-use control token, not a crawler. It governs
# whether content fetched by Applebot may be used to train Apple's models.
# Disallow precedes Allow so first-match parsers also keep /api/ blocked.
User-agent: Applebot-Extended
Disallow: /api/
Allow: /

# Microsoft Copilot (uses the Bing index)
# Disallow precedes Allow so first-match parsers also keep /api/ blocked.
User-agent: Bingbot
Disallow: /api/
Allow: /

# Common Crawl (open dataset many LLMs train on)
# Disallow precedes Allow so first-match parsers also keep /api/ blocked.
User-agent: CCBot
Disallow: /api/
Allow: /

# Sitemap is a standalone record: it is not part of any User-agent group and
# applies to every crawler. The value must be an absolute URL.
Sitemap: https://www.intrasec.ca/sitemap.xml