mhw pushed a commit to branch master in repository maintenance. commit 3cb736e37fb005233e71edff7249b9fbffdc7c2f Author: Mark H Weaver <mhw@netris.org> Date: Tue Aug 8 04:18:03 2017 -0400
hydra: nginx: Block more web crawlers. * hydra/nginx/hydra.gnu.org.conf: Add blocks for the following bots: BehloolBot, Companybook-Crawler, DotBot, YandexBot, SemrushBot, PaperLiBot, and TwitterBot. --- hydra/nginx/hydra.gnu.org.conf | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/hydra/nginx/hydra.gnu.org.conf b/hydra/nginx/hydra.gnu.org.conf index 9cf5e15..e7df7f8 100644 --- a/hydra/nginx/hydra.gnu.org.conf +++ b/hydra/nginx/hydra.gnu.org.conf @@ -79,15 +79,10 @@ http { proxy_set_header X-Forwarded-Port $server_port; proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - # XXX Block AhrefsBot, Baiduspider, Bing, SeznamBot, and - # Google. These search engines seem to disregard our robots.txt, + # XXX Block several bots that seem to disregard our robots.txt, # possibly because attempts to fetch robots.txt sometimes fails # due to gateway timeout :-( - # Also block ltx71.com, which accesses our pages ~30 times/hour - # with no apparent pattern, including our robots.txt which it - # disregards. They claim to be "scanning the internet for - # security research purposes." - if ($http_user_agent ~ "AhrefsBot|Baiduspider|bingbot|SeznamBot|ltx71.com|GoogleBot|Googlebot") { + if ($http_user_agent ~ "AhrefsBot|Baiduspider|bingbot|SeznamBot|BehloolBot|ltx71.com|GoogleBot|Googlebot|Companybook-Crawler|DotBot|YandexBot|SemrushBot|PaperLiBot|TwitterBot") { return 403; break; } @@ -130,7 +125,7 @@ http { proxy_set_header X-Forwarded-Port $server_port; proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - if ($http_user_agent ~ "AhrefsBot|Baiduspider|bingbot|ltx71.com|GoogleBot|Googlebot") { + if ($http_user_agent ~ "AhrefsBot|Baiduspider|bingbot|SeznamBot|BehloolBot|ltx71.com|GoogleBot|Googlebot|Companybook-Crawler|DotBot|YandexBot|SemrushBot|PaperLiBot|TwitterBot") { return 403; break; }