~liljamo/robots.txt: wip: initial

1 files changed, 75 insertions(+), 0 deletions(-)

A robots.txt

A  => robots.txt +75 -0

@@ 1,75 @@
+# TODO: Consider a stricter policy:
+#       disallow everything (/) for every user agent,
+#       but allow just Google to index the site.
+#       Then track the bot user-agents and manually disallow those that don't respect *.
+
+
+
+# Parts of this file are taken from the very well put together sr.ht/robots.txt,
+#  especially the part below regarding Sourcehut paths.
+# The other entries are hand-picked.
+
+# For src.quest
+# Default group: applies to every crawler that has no dedicated
+# "User-agent" group elsewhere in this file.
+# Pattern syntax: "*" matches any run of characters, "$" anchors the end of
+# the URL, and a plain path such as /metrics matches as a prefix.
+# Blocked here: URLs with a query string, .tar.gz downloads, /metrics, and
+# the Sourcehut blame/log/tree/item/mbox/raw views, which sit below two
+# leading path segments (presumably /<owner>/<repo>/ - confirm).
+User-agent: *
+Disallow: /*?*
+Disallow: /*.tar.gz$
+Disallow: /metrics
+Disallow: /*/*/blame/*
+Disallow: /*/*/log/*
+Disallow: /*/*/tree/*
+Disallow: /*/*/item/*
+Disallow: /*/*/mbox
+Disallow: /*/*/*/raw
+
+# Marketing/SEO crawlers: each one listed here is blocked from the whole site
+User-agent: SemrushBot
+Disallow: /
+User-agent: SemrushBot-SA
+Disallow: /
+User-agent: AhrefsBot
+Disallow: /
+User-agent: dotbot
+Disallow: /
+User-agent: rogerbot
+Disallow: /
+User-agent: BLEXBot
+Disallow: /
+User-agent: ZoominfoBot
+Disallow: /
+User-agent: Yandex
+Disallow: /
+User-agent: MJ12bot
+Disallow: /
+User-agent: DataForSeoBot
+Disallow: /
+# TODO: name the bot that was meant to follow DataForSeoBot. A "User-agent:" line
+#       with an empty value is invalid, and some parsers match it against every crawler.
+
+# AI crawler (presumably OpenAI's GPTBot - confirm); not wanted here
+User-agent: GPTBot
+Disallow: /
+
+# Crawlers for some Huawei-related AI services; no thanks
+User-agent: PetalBot
+Disallow: /
+User-agent: AspiegelBot
+Disallow: /
+
+# Amazon's crawler (Alexa); not wanted
+User-agent: Amazonbot
+Disallow: /
+
+# Turnitin crawlers, listed under both user-agent spellings; not wanted
+User-agent: turnitinbot
+Disallow: /
+User-agent: Turnitin
+Disallow: /
+
+# 80legs crawler, listed under the user-agent token "008"
+User-agent: 008
+Disallow: /
+
+# Apparently doesn't respect the "User-agent: *" group, so it is listed explicitly.
+# NOTE(review): this token contains a space, which RFC 9309 product tokens do
+# not allow - confirm which token the crawler actually matches on.
+User-agent: Seekport Crawler
+Disallow: /
+