From 6a060977b27b20fd69912b6a11132b831712016c Mon Sep 17 00:00:00 2001 From: Jonni Liljamo Date: Sat, 26 Oct 2024 21:09:04 +0300 Subject: [PATCH] feat: robots.txt generation --- .gitignore | 1 + README.md | 18 ++++++++++ bases/src.quest | 10 ++++++ generate.sh | 29 ++++++++++++++++ lists/ai.txt | 31 +++++++++++++++++ lists/marketingseo.txt | 12 +++++++ lists/other.txt | 13 ++++++++ lists/searchengine.txt | 2 ++ robots.txt | 75 ------------------------------------------ 9 files changed, 116 insertions(+), 75 deletions(-) create mode 100644 README.md create mode 100644 bases/src.quest create mode 100755 generate.sh create mode 100644 lists/ai.txt create mode 100644 lists/marketingseo.txt create mode 100644 lists/other.txt create mode 100644 lists/searchengine.txt delete mode 100644 robots.txt diff --git a/.gitignore b/.gitignore index c4a847d..81ce768 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ +/out/ /result diff --git a/README.md b/README.md new file mode 100644 index 0000000..b14fc28 --- /dev/null +++ b/README.md @@ -0,0 +1,18 @@ +# robots.txt + +This repository contains my very opinionated (and possibly overengineered) +robots.txt generation. 
+ +`lists` contains lists of user agents to disallow at root + * .txt files, line by line listings of user agents + * empty lines are ignored + * lines starting with # are ignored + +`bases` contains site specific base robots.txt files + * base robots.txt files in the robots.txt format + * names represent the domains they're served at + +`generate.sh` is a bash script for generating output robots.txt files + * arg $1 is a required path to the out file + * arg $2 is an optional path to a base file + diff --git a/bases/src.quest b/bases/src.quest new file mode 100644 index 0000000..3c88561 --- /dev/null +++ b/bases/src.quest @@ -0,0 +1,10 @@ +User-agent: * +Disallow: /*?* +Disallow: /*.tar.gz$ +Disallow: /metrics +Disallow: /*/*/blame/* +Disallow: /*/*/log/* +Disallow: /*/*/tree/* +Disallow: /*/*/item/* +Disallow: /*/*/mbox +Disallow: /*/*/*/raw diff --git a/generate.sh b/generate.sh new file mode 100755 index 0000000..56e8783 --- /dev/null +++ b/generate.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash + +[ -z "$1" ] && echo "no out file supplied" && exit 1 +out="$1" + +base="$2" + +# check if base was given +if [ ! -z $base ]; then + # check if base exists + [ ! 
-e "$base" ] && echo "invalid base" && exit 1
+  # copy base to out
+  cp -- "$base" "$out"
+fi
+[ -z "$base" ] && : > "$out" # no base: start from an empty out, don't append to a stale one
+# loop lists
+for filename in ./lists/*.txt; do
+  # line by line
+  while read -r line; do
+    # ignore empty
+    [ -z "$line" ] && continue
+    # ignore comments
+    [[ $line =~ ^#.* ]] && continue
+
+    # add to out
+    echo "User-agent: $line" >> "$out"
+    echo "Disallow: /" >> "$out"
+  done < "$filename"
+done
diff --git a/lists/ai.txt b/lists/ai.txt
new file mode 100644
index 0000000..8205848
--- /dev/null
+++ b/lists/ai.txt
@@ -0,0 +1,31 @@
+360Spider
+
+AI2Bot
+Applebot
+Applebot-Extended
+AspiegelBot
+Bytespider
+CCBot
+ChatGPT-User
+Claude-Web
+ClaudeBot
+Diffbot
+FacebookBot
+GPTBot
+Google-Extended
+GoogleOther
+ImagesiftBot
+Kangaroo Bot
+Meta-ExternalFetcher
+OAI-SearchBot
+omgili
+omgilibot
+Perplexity-User
+PerplexityBot
-# Then track the bot user-agents, and disallow manually those that don't respect *! - - - -# Parts of this are from the very well put together sr.ht/robots.txt, -# especially the part below regarding Sourcehut paths -# Others are handpicked - -# For src.quest -User-agent: * -Disallow: /*?* -Disallow: /*.tar.gz$ -Disallow: /metrics -Disallow: /*/*/blame/* -Disallow: /*/*/log/* -Disallow: /*/*/tree/* -Disallow: /*/*/item/* -Disallow: /*/*/mbox -Disallow: /*/*/*/raw - -# Marketing/SEO -User-agent: SemrushBot -Disallow: / -User-agent: SemrushBot-SA -Disallow: / -User-agent: AhrefsBot -Disallow: / -User-agent: dotbot -Disallow: / -User-agent: rogerbot -Disallow: / -User-agent: BLEXBot -Disallow: / -User-agent: ZoominfoBot -Disallow: / -User-agent: Yandex -Disallow: / -User-agent: MJ12bot -Disallow: / -User-agent: DataForSeoBot -Disallow: / -User-agent: -Disallow: / - -# Not my jam -User-agent: GPTBot -Disallow: / - -# Some Huawei related AI services, no thanks -User-agent: PetalBot -Disallow: / -User-agent: AspiegelBot -Disallow: / - -# Alexa, nope -User-agent: Amazonbot -Disallow: / - -# Nnnnnope -User-agent: turnitinbot -Disallow: / -User-agent: Turnitin -Disallow: / - -# 80legs crawler -User-agent: 008 -Disallow: / - -# Apparently doesn't respect * -User-agent: Seekport Crawler -Disallow: / - -- 2.44.1