DEVELOPMENT ENVIRONMENT

~liljamo/robots.txt

413b11c0419d46d0cdb04ddd3659ad30754e7dc1 — Jonni Liljamo 4 months ago
wip: initial
1 files changed, 75 insertions(+), 0 deletions(-)

A robots.txt
A  => robots.txt +75 -0
@@ 1,75 @@
# TODO: Hahaa!
#       Maybe disallow everything (/) for every crawler,
#       but allow just Google to index the site?
#       Then track the bot user-agents, and manually disallow those that don't respect "*"!



# Parts of this are from the very well put together sr.ht/robots.txt,
#  especially the part below regarding Sourcehut paths
# Others are handpicked

# For src.quest
# Default group: applies to every crawler that is not matched by a more
# specific User-agent group elsewhere in this file.
User-agent: *
# Any URL that contains a query string (for crawlers that support "*" wildcards).
Disallow: /*?*
# Any URL ending in .tar.gz; "$" anchors the pattern at the end of the URL.
Disallow: /*.tar.gz$
Disallow: /metrics
# Sourcehut paths: the first two "*" segments stand for the two leading path
# components (presumably owner and repository/list name -- confirm).
Disallow: /*/*/blame/*
Disallow: /*/*/log/*
Disallow: /*/*/tree/*
Disallow: /*/*/item/*
Disallow: /*/*/mbox
Disallow: /*/*/*/raw

# Marketing/SEO
# Each crawler below gets its own group that blocks the entire site.
User-agent: SemrushBot
Disallow: /
User-agent: SemrushBot-SA
Disallow: /
User-agent: AhrefsBot
Disallow: /
User-agent: dotbot
Disallow: /
User-agent: rogerbot
Disallow: /
User-agent: BLEXBot
Disallow: /
User-agent: ZoominfoBot
Disallow: /
User-agent: Yandex
Disallow: /
User-agent: MJ12bot
Disallow: /
User-agent: DataForSeoBot
Disallow: /
# FIXME: a group with an empty "User-agent:" value and "Disallow: /" used to
#        follow here. An empty product token is invalid, and parsers that
#        match tokens by substring treat it as matching every crawler, which
#        would block the whole site for everyone. It has been removed; if a
#        specific bot was meant, add it back with its name filled in.

# Not my jam
# GPTBot is blocked from the entire site.
User-agent: GPTBot
Disallow: /

# Some Huawei related AI services, no thanks
# Both crawlers are blocked from the entire site.
User-agent: PetalBot
Disallow: /
User-agent: AspiegelBot
Disallow: /

# Alexa, nope
# Amazonbot is blocked from the entire site.
User-agent: Amazonbot
Disallow: /

# Nnnnnope
# Turnitin crawlers: both spellings of the user-agent token are blocked
# from the entire site.
User-agent: turnitinbot
Disallow: /
User-agent: Turnitin
Disallow: /

# 80legs crawler
# Listed here under the user-agent token "008"; blocked from the entire site.
User-agent: 008
Disallow: /

# Apparently doesn't respect *
# Blocked explicitly from the entire site because the wildcard group is
# reportedly ignored by this crawler.
# NOTE(review): this token contains a space; strict parsers may only read
#               "Seekport" or ignore the line -- confirm the crawler's actual
#               user-agent token.
User-agent: Seekport Crawler
Disallow: /