# As a condition of accessing this website, you agree to abide by the following
# content signals:
# (a) If a content-signal = yes, you may collect content for the corresponding
# use.
# (b) If a content-signal = no, you may not collect content for the
# corresponding use.
# (c) If the website operator does not include a content signal for a
# corresponding use, the website operator neither grants nor restricts
# permission via content signal with respect to the corresponding use.
# The content signals and their meanings are:
# search: building a search index and providing search results (e.g., returning
# hyperlinks and short excerpts from your website's contents). Search does not
# include providing AI-generated search summaries.
# ai-input: inputting content into one or more AI models (e.g., retrieval
# augmented generation, grounding, or other real-time taking of content for
# generative AI search answers).
# ai-train: training or fine-tuning AI models.
# ANY RESTRICTIONS EXPRESSED VIA CONTENT SIGNALS ARE EXPRESS RESERVATIONS OF
# RIGHTS UNDER ARTICLE 4 OF THE EUROPEAN UNION DIRECTIVE 2019/790 ON COPYRIGHT
# AND RELATED RIGHTS IN THE DIGITAL SINGLE MARKET.

# BEGIN Cloudflare Managed content
User-agent: *
Content-signal: search=yes,ai-train=no
Allow: /

User-agent: Amazonbot
Disallow: /

User-agent: Applebot-Extended
Disallow: /

User-agent: Bytespider
Disallow: /

User-agent: CCBot
Disallow: /

User-agent: ClaudeBot
Disallow: /

User-agent: Google-Extended
Disallow: /

User-agent: GPTBot
Disallow: /

User-agent: meta-externalagent
Disallow: /
# END Cloudflare Managed Content

# robots.txt for http://www.tunearch.org/ and friends
#
# Please note: There are a lot of pages on this site, and there are
# some misbehaved spiders out there that go _way_ too fast. If you're
# irresponsible, your access to the site may be blocked.
#
# Observed spamming large amounts of https://en.wikipedia.org/?curid=NNNNNN
# and ignoring 429 ratelimit responses, claims to respect robots:
# http://mj12bot.com/

# === AI CRAWLERS ===
User-agent: anthropic-ai
Disallow: /

User-agent: ClaudeBot
Disallow: /

User-agent: Claude-Web
Disallow: /

User-agent: ChatGPT-User
Disallow: /

User-agent: GPTBot
Disallow: /

User-agent: PerplexityBot
Disallow: /

User-agent: cohere-ai
Disallow: /

User-agent: Google-Extended
Disallow: /

User-agent: FacebookBot
Disallow: /

User-agent: Meta-ExternalAgent
Disallow: /

User-agent: Timpibot
Disallow: /

User-agent: FriendlyCrawler
Disallow: /

User-agent: Image2dataset
Disallow: /

User-agent: ImagesiftBot
Disallow: /

# === SEO CRAWLERS ===
User-agent: AhrefsBot
Disallow: /

User-agent: Barkrowler
Disallow: /

User-agent: BLEXBot
Disallow: /

User-agent: MJ12bot
Disallow: /

User-agent: DataForSeoBot
Disallow: /

User-agent: DotBot
Disallow: /

User-agent: SemrushBot
Disallow: /

# === SEARCH ENGINES & DATA ===
User-agent: Yandex
Disallow: /

User-agent: PetalBot
Disallow: /

User-agent: Amazonbot
Disallow: /

User-agent: Bytespider
Disallow: /

# === SECURITY & NETWORK SCANNERS ===
User-agent: CensysInspect
Disallow: /

User-agent: Expanse
Disallow: /

User-agent: internet-measurement
Disallow: /

# === GENERIC / ABUSIVE ===
User-agent: Scrapy
Disallow: /

User-agent: python-requests
Disallow: /

User-agent: Java
Disallow: /

User-agent: Go-http-client
Disallow: /

User-agent: news-please
Disallow: /

User-agent: Dataprovider
Disallow: /

User-agent: Orbbot
Disallow: /

User-agent: IonCrawl
Disallow: /

User-agent: ISSCyberRiskCrawler
Disallow: /

User-agent: VelenPublicWebCrawler
Disallow: /

User-agent: peer39_crawler
Disallow: /

User-agent: Zoominfobot
Disallow: /

User-agent: wp_is_mobile
Disallow: /

# Misbehaving: requests much too fast:
User-agent: fast
Disallow: /
#
# Sorry, wget in its recursive mode is a frequent problem.
# Please read the man page and use it properly; there is a
# --wait option you can use to set the delay between hits,
# for instance.
#
User-agent: wget
Disallow: /
#
# The 'grub' distributed client has been *very* poorly behaved.
#
User-agent: grub-client
Disallow: /
#
# Friendly, low-speed bots are welcome viewing article pages, but not
# dynamically-generated pages please.
#
# Inktomi's "Slurp" can read a minimum delay between hits; if your
# bot supports such a thing using the 'Crawl-delay' or another
# instruction, please let us know.
#
# There is a special exception for API mobileview to allow dynamic
# mobile web & app views to load section content.
# These views aren't HTTP-cached but use parser cache aggressively
# and don't expose special: pages etc.
#
# Another exception is for REST API documentation, located at
# /api/rest_v1/?doc.
#
User-agent: *
Sitemap: https://tunearch.org/w/sitemap/tta.xml
Allow: /w/sitemap/
Allow: /w/api.php?action=mobileview&
Allow: /w/load.php?
Allow: /api/rest_v1/?doc
Disallow: /w/
Disallow: /api/
Disallow: /trap/
Disallow: /wiki/Special:
Disallow: /wiki/User:
Disallow: /wiki/User_talk:
Disallow: /wiki/MediaWiki:
Disallow: /wiki/MediaWiki_talk:
Disallow: /wiki/Template:
Disallow: /wiki/Template_talk: