From c7a6ecb34691abda06c31dc149a66178bf6eebba Mon Sep 17 00:00:00 2001 From: Michael Date: Fri, 10 May 2024 09:33:36 +0000 Subject: [PATCH] Blockbot: More agents added --- blockbot/blockbot.php | 138 ++++++++++++++++++++++++++++++++---------- 1 file changed, 106 insertions(+), 32 deletions(-) diff --git a/blockbot/blockbot.php b/blockbot/blockbot.php index 77886edf..e5de1ef3 100644 --- a/blockbot/blockbot.php +++ b/blockbot/blockbot.php @@ -62,14 +62,14 @@ function blockbot_init_1() $parts = blockbot_get_parts($_SERVER['HTTP_USER_AGENT']); - $logdata = ['isCrawler' => $isCrawler, 'agent' => $_SERVER['HTTP_USER_AGENT'], 'uri' => $_SERVER['REQUEST_URI'], 'parts' => $parts]; + $logdata = ['isCrawler' => $isCrawler, 'agent' => $_SERVER['HTTP_USER_AGENT'], 'method' => $_SERVER['REQUEST_METHOD'], 'uri' => $_SERVER['REQUEST_URI'], 'parts' => $parts]; if ($isCrawler) { blockbot_check_login_attempt($_SERVER['REQUEST_URI'], $logdata); } if (empty($parts)) { - Logger::debug('Known frontend found', $logdata); + Logger::debug('Known frontend found - accept', $logdata); if ($isCrawler) { blockbot_save('badly-parsed-agents', $_SERVER['HTTP_USER_AGENT']); } @@ -77,75 +77,77 @@ function blockbot_init_1() } if (blockbot_is_crawler($parts)) { - Logger::debug('Crawler found', $logdata); + Logger::debug('Crawler found - reject', $logdata); blockbot_reject(); } if (blockbot_is_searchbot($parts)) { - Logger::debug('Search bot found', $logdata); + Logger::debug('Search bot found - reject', $logdata); blockbot_reject(); } if (blockbot_is_unwanted($parts)) { - Logger::debug('Uncategorized unwanted agent found', $logdata); + Logger::debug('Uncategorized unwanted agent found - reject', $logdata); blockbot_reject(); } if (blockbot_is_security_checker($parts)) { - Logger::debug('Security checker found', $logdata); if (!DI::config()->get('blockbot', 'security_checker')) { + Logger::debug('Security checker found - reject', $logdata); blockbot_reject(); } + Logger::debug('Security checker found - accept', $logdata); return; } if (blockbot_is_social_media($parts)) { - Logger::debug('Social media service found', $logdata); + Logger::debug('Social media service found - accept', $logdata); return; } if (blockbot_is_fediverse_client($parts)) { - Logger::debug('Fediverse client found', $logdata); + Logger::debug('Fediverse client found - accept', $logdata); return; } if (blockbot_is_feed_reader($parts)) { - Logger::debug('Feed reader found', $logdata); + Logger::debug('Feed reader found - accept', $logdata); return; } if (blockbot_is_fediverse_tool($parts)) { - Logger::debug('Fediverse tool found', $logdata); + Logger::debug('Fediverse tool found - accept', $logdata); return; } if (blockbot_is_service_agent($parts)) { - Logger::debug('Service agent found', $logdata); + Logger::debug('Service agent found - accept', $logdata); return; } if (blockbot_is_monitor($parts)) { - Logger::debug('Monitoring service found', $logdata); + Logger::debug('Monitoring service found - accept', $logdata); return; } if (blockbot_is_validator($parts)) { - Logger::debug('Validation service found', $logdata); + Logger::debug('Validation service found - accept', $logdata); return; } if (blockbot_is_good_tool($parts)) { - Logger::debug('Uncategorized helpful service found', $logdata); + Logger::debug('Uncategorized helpful service found - accept', $logdata); return; } // Needs to be checked at the end, since other services might use these libraries if (blockbot_is_http_library($parts)) { blockbot_check_login_attempt($_SERVER['REQUEST_URI'], $logdata); - Logger::debug('HTTP Library found', $logdata); if (!DI::config()->get('blockbot', 'http_libraries')) { + Logger::debug('HTTP Library found - reject', $logdata); blockbot_reject(); } + Logger::debug('HTTP Library found - accept', $logdata); return; } @@ -161,7 +163,7 @@ function blockbot_init_1() } blockbot_save('bad-agents', $_SERVER['HTTP_USER_AGENT']); - Logger::notice('Blocked bot', $logdata); + Logger::notice('Possible bot found - reject', $logdata); blockbot_reject(); } @@ -182,7 +184,7 @@ function blockbot_save($database, $userAgent) function blockbot_check_login_attempt(string $url, array $logdata) { if (in_array(trim(parse_url($url, PHP_URL_PATH), '/'), ['login', 'lostpass', 'register'])) { - Logger::debug('Login attempt detected', $logdata); + Logger::debug('Login attempt detected - reject', $logdata); blockbot_reject(); } } @@ -198,9 +200,9 @@ function blockbot_is_unwanted(array $parts): bool $agents = [ 'oii-research', 'yisouspider', 'bots.retroverse.social', 'gaisbot', 'bloglines', 'emailwolf', 'webtech', 'facebookscraper', 'www.ecsl.cs.sunysb.edu/~maxim/cgi-bin/link', - 'gulper', 'magellan', 'linkcheck', 'nerdybot', 'ms search robot', 'fast-webcrawler', + 'gulper', 'magellan', 'linkcheck', 'nerdybot', 'ms search robot', 'fast-webcrawler', 'yioopbot', 'webster', 'www.admantx.com', 'openhosebot', 'lssrocketcrawler', 'dow jones searchbot', - 'gomezagent', 'domainsigmacrawler', 'netseer crawler', 'gptbot', 'superbot', 'searchexpress', + 'gomezagent', 'domainsigmacrawler', 'netseer crawler', 'superbot', 'searchexpress', 'alittle client', 'amazon-kendra', 'scanner.ducks.party', 'isscyberriskcrawler', 'google wireless transcoder', ]; @@ -221,6 +223,18 @@ function blockbot_is_unwanted(array $parts): bool */ function blockbot_is_crawler(array $parts): bool { + $agents = [ + '+http://yourls.org', 'adbeat.com/policy', 'https://gtmetrix.com', 'hubspot', 'nutch-', + 'openwebspider' + ]; + foreach ($parts as $part) { + foreach ($agents as $agent) { + if (strpos($part, $agent) !== false) { + return true; + } + } + } + $agents = [ 'ahrefsbot', 'pinterest', 'proximic', 'applebot', 'synapseworkstation.3.2.1', 'slackbot-linkexpanding', 'semrushbot-sa', 'qwantify', 'google search console', @@ -241,8 +255,26 @@ function blockbot_is_crawler(array $parts): bool 'nuzzel', 'boardreader blog indexer', 'hatena-favicon', 'nbertaupete95', 'scrapy', "electronic frontier foundation's do not track verifier", 'synapse', 'trendsmapresolver', 'pinterestbot', 'um-ln', 'slack-imgproxy', 'diffbot', 'dataforseobot', 'bw', 'bitlybot', - 'twingly recon-klondike', 'imagesiftbot', 'google', 'rogerbot', 'yahoocachesystem', - 'vkshare', 'appid: s~virustotalcloud', 'clickagy intelligence bot v2', + 'twingly recon-klondike', 'imagesiftbot', 'rogerbot', 'yahoocachesystem', 'favicon', + 'vkshare', 'appid: s~virustotalcloud', 'clickagy intelligence bot v2', 'gptbot', + 'archive.org_bot http://archive.org/details', 'wellknownbot', 'archiveteam archivebot', + 'megaindex.ru', 'adbeat_bot', 'masscan', 'embedly', 'cloudflare-amp', 'exabot-thumbnails', + 'yahoo ad monitoring', 'seokicks-robot', 'trendiction search', 'semrushbot-si', 'plukkie', + 'hubpages v0.2.2', 'aream.bot', 'safednsbot', 'linkpadbot', 'gluten free crawler', + 'turnitinbot', 'xovibot', 'domaincrawler', 'nettrack', 'domaincrawler', 'yak', 'bubing', + 'netestate ne crawler', 'blexbot', 'the knowledge ai', 'optimizer', 'hubspot webcrawler', + 'venuscrawler', 'adstxtcrawler', 'iframely', 'checkmarknetwork', 'semrushbot-ba', + 'archive.org bot', 'aihitbot', 'sitesucker', 'adstxtlab.com crawler', 'jobboersebot', + 'http://www.archive.org/details/archive.org_bot', 'heritrix', 'appid: s~snapchat-proxy', + 'icc-crawler', 'mbcrawler', 'slackbot', 'trumind-crawler', 'newspaper', 'online-webceo-bot', + 'haena-pepper', 'y! crawler', 'linkwalker', 'seznamemailproxy', 'seekport crawler', + 'domainstatsbot', 'qwantify/mermoz', 'sprinklr', 'komodiabot', 'seoscanners.net', + 'domainappender', 'mixrankbot', 'abonti', 'urlappendbot', 'sistrix crawler', + 'hatenabookmark', 'metainspector', 'ezooms', 'quora link preview', 'semrushbot-bm', + 'barkrowler', 'panscient.com', 'http://tweetedtimes.com', 'twingly recon', + 'collection@infegy.com', 'mediatoolkitbot', 'cloudflare-amphtml', 'ramblermail', + 'tineye', 'adscanner', 'datagnionbot', 'aa_crawler', 'http://www.profound.net/domainappender', + 'appid: e~arsnova-filter-system', 'kinglandsystemscorp', 'crmnlcrawlagent', 'techfetch-bot', ]; foreach ($parts as $part) { @@ -265,11 +297,20 @@ function blockbot_is_crawler(array $parts): bool */ function blockbot_is_searchbot(array $parts): bool { + $agents = ['baiduspider']; + foreach ($parts as $part) { + foreach ($agents as $agent) { + if (strpos($part, $agent) !== false) { + return true; + } + } + } + $agents = [ 'yahoo! slurp', 'linkcheck by siteimprove.com', 'googlebot', '360spider', 'haosouspider', 'mj12bot', 'feedfetcher-google', 'mediapartners-google', 'duckduckgo-favicons-bot', 'googlebot-mobile', 'gigablastopensource', 'bingbot', 'surveybot', 'yandexbot', - 'baiduspider', 'google web preview', 'meanpathbot', 'wesee_bot:we_help_monitize_your_site', + 'google web preview', 'meanpathbot', 'wesee_bot:we_help_monitize_your_site', 'seznambot', 'sogou web spider', 'linkdexbot', 'msnbot', 'smtbot', 'yandexmetrika', 'google-site-verification', 'netcraft ssl server survey - contact info@netcraft.com', 'orangebot', 'google-adwords-instant', 'googlebot-richsnippets', 'google-lens', @@ -281,6 +322,15 @@ function blockbot_is_searchbot(array $parts): bool 'googleassociationservice', 'yandexwebmaster', 'yacybot', 'duckduckbot-https', 'yandexmobilebot', 'mail.ru_bot/fast', 'yandeximages', 'mail.ru_bot/img', 'ia_archiver', 'yandexblogs', 'yandexaccessibilitybot', 'yandeximageresizer', 'mail.ru_bot', 'yeti', 'obot', 'baiduspider-render', + 'netcraft web server survey', 'yandexnews', 'google', 'yandexrenderresourcesbot', + 'match by siteimprove.com', 'yandexsitelinks', 'yandexantivirus', 'daum', 'mail.ru_bot/robots', + 'yandexmedia', 'msnbot-products', 'yandexvideo', 'yandexvertis', 'catexplorador', 'yandexcalendar', + 'yandexfavicons', 'user-agent\x09baiduspider', 'baiduspider-image', 'yandexpagechecker', 'mojeekbot', + 'adsbot-google-mobile', 'google-adwords-displayads-webrender', 'seznam screenshot-generator', + 'yandexscreenshotbot', 'zumbot', 'tracemyfile', 'wotbox', 'google-adwords-express', + 'google-adwords-displayads', 'google-youtube-links', 'yandexvideoparser', 'paperlibot', + 'weborama-fetcher', 'googleproducer', 'coccoc', 'acoonbot', 'psbot', 'sosospider', 'voilabot', + 'blekkobot', 'easouspider', 'omgili', 'yadirectfetcher', 'sogou pic spider', 'daumoa', ]; foreach ($parts as $part) { @@ -304,7 +354,7 @@ function blockbot_is_security_checker(array $parts): bool 'bitsightbot', 'censysinspect', 'pathspider', 'repolookoutbot', 'sqlmap', 'ltx71', 'netsystemsresearch studies the availability of various services across the internet. our website is netsystemsresearch.com', 'expanse a palo alto networks company searches across the global ipv4 space multiple times per day to identify customers'', - 'zgrab', 'nmap scripting engine', 'l9scan', 'riddler', + 'zgrab', 'nmap scripting engine', 'l9scan', 'riddler', 'cloud mapping experiment. contact research@pdrlabs.net', ]; foreach ($parts as $part) { @@ -324,7 +374,8 @@ function blockbot_is_security_checker(array $parts): bool function blockbot_is_validator(array $parts): bool { $agents = [ - 'jigsaw', 'ssl labs', 'w3c_validator', 'w3c-checklink', 'p3p validator', 'csscheck', + 'jigsaw', 'ssl labs', 'w3c_validator', 'w3c-checklink', 'p3p validator', 'csscheck', 'validator.nu', + 'google-structured-data-testing-tool https://search.google.com/structured-data', 'w3c_unicorn', ]; foreach ($parts as $part) { @@ -345,7 +396,9 @@ function blockbot_is_monitor(array $parts): bool { $agents = [ 'alexa site audit', 'catchpoint', 'google page speed insights', 'checkhost', - 'poduptime', 'chrome-lighthouse', 'zabbix', 'cloudflare-alwaysonline', + 'poduptime', 'chrome-lighthouse', 'zabbix', 'cloudflare-alwaysonline', 'ptst', + 'pingadmin.ru', 'pingdomtms', 'nimbostratus-bot', 'uptimebot', 'uptimerobot', + 'http://notifyninja.com/monitoring', 'http://www.freewebmonitoring.com', ]; foreach ($parts as $part) { @@ -382,14 +435,14 @@ function blockbot_is_social_media(array $parts): bool 'firefish', 'activity-relay', 'juick', 'camo', 'python/federation', 'nextcloud', 'snac', 'bovine', 'takahe', 'freedica', 'gnu social', 'microblogpub', 'mbin', 'mammoth', 'kbinbot', 'honksnonk', 'misskeymediaproxy', 'kbinbot', 'jistflow', - 'mastodon/3.4.1 fedibird', 'fedibird', 'funkwhale', + 'mastodon/3.4.1 fedibird', 'fedibird', 'funkwhale', 'linkedinbot', 'wafrn-cache-generator', 'simple social network', 'mbinbot', 'wordpress.com', 'catnip', 'castopod', 'enby-town', 'vernissage', 'iceshrimp.net', 'plasmatrap', 'imgproxy', 'rustypub', 'flipboard activitypub', 'gnu social activitypub plugin', 'micro.blog', 'mastodon-bookmark-rss', 'bookwyrm', 'damus', 'primal', 'misskeyadmin', 'ruby, mastodon', 'nextcloud social', 'camo asset proxy', 'smithereen', 'sorasns', 'cherrypick', 'bonfire activitypub federation', 'upub+0.1.0', 'plume', 'incestoma', - 'gyptazyfedi', 'apogee', 'quolibet', + 'gyptazyfedi', 'apogee', 'quolibet', 'magpie-crawler', 'redditbot', 'facebookplatform', ]; foreach ($parts as $part) { @@ -441,7 +494,7 @@ function blockbot_is_feed_reader(array $parts): bool 'windows-rss-platform', 'feedshow', 'feedreader', 'rssbandit', 'everyfeed-spider', 'feeeed', 'spacecowboys android rss reader', 'gregarius', 'feedspot', 'feedspot ssl asset proxy', 'newsgator', 'newsgator fetchlinks extension', - 'akregator', + 'akregator', 'appid: s~feedly-nikon3', ]; foreach ($parts as $part) { @@ -482,11 +535,21 @@ function blockbot_is_fediverse_tool(array $parts): bool */ function blockbot_is_service_agent(array $parts): bool { + $agents = ['wordpress.com']; + foreach ($parts as $part) { + foreach ($agents as $agent) { + if (strpos($part, $agent) !== false) { + return true; + } + } + } + $agents = [ 'chrome privacy preserving prefetch proxy', 'http compression test', 'microsoftpreview', 'pocketimagecache', 'wordpress', 'skypeuripreview preview', 'wordpress.com', 'discordbot', 'summalybot', 'livelapbot', 'whatsapp', 'facebot', 'skypeuripreview', - 'plasmatrap image proxy server', 'grammarly', + 'plasmatrap image proxy server', 'grammarly', 'browsershots', 'google-apps-script', + 'yahoomailproxy', 'pocketparser', 'apachebench', ]; foreach ($parts as $part) { @@ -505,10 +568,19 @@ function blockbot_is_service_agent(array $parts): bool */ function blockbot_is_http_library(array $parts): bool { - if ((count($parts) == 1) && in_array($parts[0], ['okhttp', 'useragent'])) { + if ((count($parts) == 1) && in_array($parts[0], ['okhttp', 'useragent', 'faraday'])) { return true; } + $agents = ['faraday ']; + foreach ($parts as $part) { + foreach ($agents as $agent) { + if (strpos($part, $agent) !== false) { + return true; + } + } + } + $agents = [ 'python-urllib', 'go-http-client', 'axios', 'java', 'undici', 'node', 'ruby', 'mint', 'wget', 'dart:io', 'dart', 'caveman-sieve', 'guzzlehttp', 'deno', @@ -522,7 +594,9 @@ function blockbot_is_http_library(array $parts): bool 'cpp-httplib', 'fuzz faster u fool v1.3.1-dev', 'fuzz faster u fool v1.5.0-dev', 'go http package', 'go-resty', 'http.rb', 'ivre-masscan', 'java1.0.21.0', 'jsdom', 'python-urllib3', 'reactornetty', 'req', 'restsharp', 'ruby-rdf-distiller', - 'pycurl', + 'pycurl', 'fdm', 'fdmx', 'lua-resty-http', 'python-httplib2', 'anyevent-http', + 'node-superagent', 'unirest-java', 'gvfs', 'http_request2', 'java browser', 'cakephp', + 'curly http client', 'lavf', 'typhoeus', ]; foreach ($parts as $part) { @@ -544,7 +618,7 @@ function blockbot_is_good_tool(array $parts): bool $agents = [ 'easy-feed-oven', 'cutycapt', 'rss-is-dead.lol web bot', 'dnt-policy@eff.org', 'https://socnetv.org', 'opengraphreader', 'trendfetcher', 'iabot', 'rss-is-dead.lol feed bot', - 'androiddownloadmanager', 'readybot.io', 'hydra', 'httrack', 'vlc', 'wellknownbot', 'wdg_validator', 'download demon', + 'androiddownloadmanager', 'readybot.io', 'hydra', 'httrack', 'vlc', 'wdg_validator', 'download demon', ];