M .gitignore => .gitignore +1 -0
@@ 1,1 1,2 @@
+/out/
/result
A README.md => README.md +33 -0
@@ 0,0 1,33 @@
+# robots.txt
+
+This repository contains my very opinionated (and possibly overengineered)
+robots.txt generation setup.
+
+`lists` contains lists of user agents to disallow at the root (`Disallow: /`)
+ * `.txt` files, line-by-line listings of user agents
+ * empty lines are ignored
+ * lines starting with # are ignored
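+
+For example, a small list file could look like this (the entries are borrowed
+from `lists/marketingseo.txt`; the comment line is just illustrative):
+
+    # SEO crawlers
+    AhrefsBot
+    MJ12bot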
+
+`bases` contains site-specific base robots.txt files
+ * written in the plain robots.txt format
+ * file names represent the domains they're served at
+
+`generate.sh` is a bash script for generating output robots.txt files
+ * arg `$1` is a required path to the out file
+ * arg `$2` is an optional path to a base file
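+
+Example invocation, writing into the git-ignored `out/` directory (the exact
+output path is illustrative):
+
+    ./generate.sh out/robots.txt bases/src.quest
+
+For every list entry the script appends a `User-agent: <name>` line followed
+by `Disallow: /` to the out file.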
+
A bases/src.quest => bases/src.quest +10 -0
@@ 0,0 1,10 @@
+User-agent: *
+Disallow: /*?*
+Disallow: /*.tar.gz$
+Disallow: /metrics
+Disallow: /*/*/blame/*
+Disallow: /*/*/log/*
+Disallow: /*/*/tree/*
+Disallow: /*/*/item/*
+Disallow: /*/*/mbox
+Disallow: /*/*/*/raw
A generate.sh => generate.sh +29 -0
@@ 0,0 1,29 @@
+#!/usr/bin/env bash
+
+[ -z "$1" ] && echo "no out file supplied" && exit 1
+out="$1"
+
+base="$2"
+
+# check if base was given
+if [ -n "$base" ]; then
+  # check if base exists
+  [ ! -e "$base" ] && echo "invalid base" && exit 1
+  # copy base to out
+  cp "$base" "$out"
+fi
+
+# loop lists
+for filename in ./lists/*.txt; do
+  # line by line
+  while read -r line; do
+    # ignore empty
+    [ -z "$line" ] && continue
+    # ignore comments
+    [[ $line =~ ^#.* ]] && continue
+
+    # add to out
+    echo "User-agent: $line" >> "$out"
+    echo "Disallow: /" >> "$out"
+  done < "$filename"
+done
A lists/ai.txt => lists/ai.txt +30 -0
@@ 0,0 1,30 @@
+360Spider
+
+AI2Bot
+Applebot
+Applebot-Extended
+AspiegelBot
+Bytespider
+CCBot
+ChatGPT-User
+Claude-Web
+ClaudeBot
+Diffbot
+FacebookBot
+GPTBot
+Google-Extended
+GoogleOther
+ImagesiftBot
+Kangaroo Bot
+Meta-ExternalFetcher
+OAI-SearchBot
+Omgili
+Omgilibot
+PerplexityBot
+PetalBot
+Timpibot
+Webzio-Extended
+YouBot
+
+anthropic-ai
+cohere-ai
A lists/marketingseo.txt => lists/marketingseo.txt +12 -0
@@ 0,0 1,12 @@
+AhrefsBot
+BLEXBot
+DataForSeoBot
+MJ12bot
+SemrushBot
+SemrushBot-SA
+ZoominfoBot
+
+barkrowler
+dotbot
+rogerbot
+serpstatbot
A lists/other.txt => lists/other.txt +13 -0
@@ 0,0 1,13 @@
+# Apparently doesn't respect *
+Seekport Crawler
+
+# Alexa, what is my current location?
+Amazonbot
+meta-externalagent
+
+# Meh
+turnitinbot
+Turnitin
+
+# 80legs crawler
+008
A lists/searchengine.txt => lists/searchengine.txt +2 -0
@@ 0,0 1,2 @@
+AlexandriaOrgBot
+Yandex
D robots.txt => robots.txt +0 -75
@@ 1,75 0,0 @@
-# TODO: Hahaa!
-# Maybe disallow everything for... everything (/),
-# but allow just google to index it?
-# Then track the bot user-agents, and disallow manually those that don't respect *!
-
-
-
-# Parts of this are from the very well put together sr.ht/robots.txt,
-# especially the part below regarding Sourcehut paths
-# Others are handpicked
-
-# For src.quest
-User-agent: *
-Disallow: /*?*
-Disallow: /*.tar.gz$
-Disallow: /metrics
-Disallow: /*/*/blame/*
-Disallow: /*/*/log/*
-Disallow: /*/*/tree/*
-Disallow: /*/*/item/*
-Disallow: /*/*/mbox
-Disallow: /*/*/*/raw
-
-# Marketing/SEO
-User-agent: SemrushBot
-Disallow: /
-User-agent: SemrushBot-SA
-Disallow: /
-User-agent: AhrefsBot
-Disallow: /
-User-agent: dotbot
-Disallow: /
-User-agent: rogerbot
-Disallow: /
-User-agent: BLEXBot
-Disallow: /
-User-agent: ZoominfoBot
-Disallow: /
-User-agent: Yandex
-Disallow: /
-User-agent: MJ12bot
-Disallow: /
-User-agent: DataForSeoBot
-Disallow: /
-User-agent:
-Disallow: /
-
-# Not my jam
-User-agent: GPTBot
-Disallow: /
-
-# Some Huawei related AI services, no thanks
-User-agent: PetalBot
-Disallow: /
-User-agent: AspiegelBot
-Disallow: /
-
-# Alexa, nope
-User-agent: Amazonbot
-Disallow: /
-
-# Nnnnnope
-User-agent: turnitinbot
-Disallow: /
-User-agent: Turnitin
-Disallow: /
-
-# 80legs crawler
-User-agent: 008
-Disallow: /
-
-# Apparently doesn't respect *
-User-agent: Seekport Crawler
-Disallow: /
-