~liljamo/robots.txt

6a060977b27b20fd69912b6a11132b831712016c — Jonni Liljamo 27 days ago 607a929
feat: robots.txt generation
9 files changed, 116 insertions(+), 75 deletions(-)

M .gitignore
A README.md
A bases/src.quest
A generate.sh
A lists/ai.txt
A lists/marketingseo.txt
A lists/other.txt
A lists/searchengine.txt
D robots.txt
M .gitignore => .gitignore +1 -0
@@ -1,1 +1,2 @@
/out/
/result

A README.md => README.md +18 -0
@@ -0,0 +1,18 @@
# robots.txt

This repository contains my very opinionated (and possibly overengineered)
robots.txt generation.

`lists` contains lists of user agents to disallow at the root
 * .txt files, line-by-line listings of user agents
 * empty lines are ignored
 * lines starting with # are ignored

`bases` contains site-specific base robots.txt files
 * base robots.txt files in the robots.txt format
 * file names match the domains they're served at

`generate.sh` is a bash script for generating output robots.txt files (see the usage example below)
 * arg $1 is a required path to the output file
 * arg $2 is an optional path to a base file
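
For example, assuming the repository root as the working directory and an existing `out/` directory (the output paths here are illustrative):

```sh
# generate for src.quest, starting from its site base
./generate.sh out/src.quest bases/src.quest

# generate a lists-only robots.txt with no base
./generate.sh out/default
```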


A bases/src.quest => bases/src.quest +10 -0
@@ -0,0 +1,10 @@
User-agent: *
Disallow: /*?*
Disallow: /*.tar.gz$
Disallow: /metrics
Disallow: /*/*/blame/*
Disallow: /*/*/log/*
Disallow: /*/*/tree/*
Disallow: /*/*/item/*
Disallow: /*/*/mbox
Disallow: /*/*/*/raw
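
These rules use the wildcard syntax of RFC 9309, where * matches any sequence of characters and $ anchors the end of the URL. For illustration, hypothetical src.quest URLs that a compliant crawler would skip under them:

    https://src.quest/~user/repo/log?from=50           (Disallow: /*?*, any URL with a query string)
    https://src.quest/~user/repo/archive/main.tar.gz   (Disallow: /*.tar.gz$, tarball downloads)
    https://src.quest/~user/repo/blame/main/README.md  (Disallow: /*/*/blame/*, blame views)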

A generate.sh => generate.sh +29 -0
@@ -0,0 +1,29 @@
#!/usr/bin/env bash

[ -z "$1" ] && echo "no out file supplied" && exit 1
out="$1"

base="$2"

# check if base was given
if [ -n "$base" ]; then
    # check if base exists
    [ ! -e "$base" ] && echo "invalid base" && exit 1
    # copy base to out (quoted so paths with spaces work)
    cp "$base" "$out"
else
    # no base: truncate out so reruns don't append to stale output
    : > "$out"
fi

# loop lists
for filename in ./lists/*.txt; do
    # line by line
    while read -r line; do
        # ignore empty
        [ -z "$line" ] && continue
        # ignore comments
        [[ $line =~ ^#.* ]] && continue

        # append a root disallow record for this user agent
        echo "User-agent: $line" >> "$out"
        echo "Disallow: /" >> "$out"
    done < "$filename"
done
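
Each list line that survives the filters becomes a two-line record, so an entry such as GPTBot in lists/ai.txt is appended to the output as:

    User-agent: GPTBot
    Disallow: /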

A lists/ai.txt => lists/ai.txt +31 -0
@@ -0,0 +1,31 @@
360Spider

AI2Bot
Applebot
Applebot-Extended
AspiegelBot
Bytespider
CCBot
ChatGPT-User
Claude-Web
ClaudeBot
Diffbot
FacebookBot
GPTBot
Google-Extended
GoogleOther
ImagesiftBot
Kangaroo Bot
Meta-ExternalFetcher
OAI-SearchBot
omgili
omgilibot
PerplexityBot
PetalBot
Timpibot
Webzio-Extended
YouBot

anthropic-ai
cohere-ai

A lists/marketingseo.txt => lists/marketingseo.txt +12 -0
@@ -0,0 +1,12 @@
AhrefsBot
BLEXBot
DataForSeoBot
MJ12bot
SemrushBot
SemrushBot-SA
ZoominfoBot

barkrowler
dotbot
rogerbot
serpstatbot

A lists/other.txt => lists/other.txt +13 -0
@@ -0,0 +1,13 @@
# Apparently doesn't respect *
Seekport Crawler

# Alexa, what is my current location?
Amazonbot
meta-externalagent

# Meh
turnitinbot
Turnitin

# 80legs crawler
008

A lists/searchengine.txt => lists/searchengine.txt +2 -0
@@ -0,0 +1,2 @@
AlexandriaOrgBot
Yandex

D robots.txt => robots.txt +0 -75
@@ -1,75 +0,0 @@
# TODO: Hahaa!
#       Maybe disallow everything for... everything (/),
#       but allow just google to index it?
#       Then track the bot user-agents, and disallow manually those that don't respect *!



# Parts of this are from the very well put together sr.ht/robots.txt,
#  especially the part below regarding Sourcehut paths
# Others are handpicked

# For src.quest
User-agent: *
Disallow: /*?*
Disallow: /*.tar.gz$
Disallow: /metrics
Disallow: /*/*/blame/*
Disallow: /*/*/log/*
Disallow: /*/*/tree/*
Disallow: /*/*/item/*
Disallow: /*/*/mbox
Disallow: /*/*/*/raw

# Marketing/SEO
User-agent: SemrushBot
Disallow: /
User-agent: SemrushBot-SA
Disallow: /
User-agent: AhrefsBot
Disallow: /
User-agent: dotbot
Disallow: /
User-agent: rogerbot
Disallow: /
User-agent: BLEXBot
Disallow: /
User-agent: ZoominfoBot
Disallow: /
User-agent: Yandex
Disallow: /
User-agent: MJ12bot
Disallow: /
User-agent: DataForSeoBot
Disallow: /
User-agent: 
Disallow: /

# Not my jam
User-agent: GPTBot
Disallow: /

# Some Huawei related AI services, no thanks
User-agent: PetalBot
Disallow: /
User-agent: AspiegelBot
Disallow: /

# Alexa, nope
User-agent: Amazonbot
Disallow: /

# Nnnnnope
User-agent: turnitinbot
Disallow: /
User-agent: Turnitin
Disallow: /

# 80legs crawler
User-agent: 008
Disallow: /

# Apparently doesn't respect *
User-agent: Seekport Crawler
Disallow: /