# Robots.txt for Anysite.io
# Last updated: 2026-04-07

# Allow all legitimate crawlers
User-agent: *
Allow: /

# Optimize crawl budget - disallow non-content pages
Disallow: /ghost/
Disallow: /p/
Disallow: /email/
Disallow: /admin/
Disallow: /content/images/size/
Disallow: /*?v=
Disallow: /*&v=
Disallow: /members/api/
Disallow: /webmentions/receive/
Disallow: /r/

# Block old URLs that 301 redirect (prevent duplicate indexing)
Disallow: /home/
Disallow: /rest-api/
Disallow: /mcp/
Disallow: /cli/
Disallow: /privacy/
Disallow: /terms/

# Allow important content
Allow: /products/
Allow: /endpoints/
Allow: /pricing/
Allow: /blog/
Allow: /about/

# Specific crawler rules for major search engines
User-agent: Googlebot
Crawl-delay: 0
Allow: /

User-agent: Bingbot
Crawl-delay: 0
Allow: /

User-agent: Slurp
Crawl-delay: 0
Allow: /

User-agent: DuckDuckBot
Crawl-delay: 0
Allow: /

# Allow AI crawlers for training
User-agent: GPTBot
Allow: /

User-agent: ChatGPT-User
Allow: /

User-agent: Claude-Web
Allow: /

User-agent: anthropic-ai
Allow: /

User-agent: Google-Extended
Allow: /

User-agent: CCBot
Allow: /

User-agent: PerplexityBot
Allow: /

# Allow SEO analysis crawlers
User-agent: AhrefsBot
Crawl-delay: 1
Allow: /

User-agent: SemrushBot
Crawl-delay: 1
Allow: /

# Block aggressive scrapers
User-agent: DotBot
Disallow: /

User-agent: MJ12bot
Disallow: /

User-agent: rogerbot
Disallow: /

# Prevent crawling of tracking parameters (Yandex-specific directive)
Clean-param: utm_source&utm_medium&utm_campaign&utm_content&utm_term&ref&fbclid&gclid /

# Sitemap location
Sitemap: https://anysite.io/sitemap.xml