diff --git a/default/Makefile.am b/default/Makefile.am index e1367bc20..c5da05e3b 100644 --- a/default/Makefile.am +++ b/default/Makefile.am @@ -28,7 +28,6 @@ nobase_nodist_default_DATA = \ nobase_default_DATA = \ auth.conf \ charset.conf \ - crawlers_detection.conf \ create_list.conf \ edit_list.conf \ ldap_alias_entry.tt2 \ diff --git a/default/crawlers_detection.conf b/default/crawlers_detection.conf deleted file mode 100644 index 753ed4ab5..000000000 --- a/default/crawlers_detection.conf +++ /dev/null @@ -1,260 +0,0 @@ -# Robots user agent string. Source http://www.useragentstring.com/pages/useragentstring.php (09/2007) - -user_agent_string ABACHOBot - -user_agent_string AbiLogicBot - -user_agent_string Accoona-AI-Agent - -user_agent_string AnyApexBot - -user_agent_string Arachmo - -user_agent_string B-l-i-t-z-B-O-T - -user_agent_string Baiduspider - -user_agent_string BecomeBot - -user_agent_string Bimbot - -user_agent_string BlitzBOT - -user_agent_string boitho.com-dc - -user_agent_string boitho.com-robot - -user_agent_string btbot - -user_agent_string Cerberian Drtrs - -user_agent_string ConveraCrawler - -user_agent_string cosmos - -user_agent_string DataparkSearch - -user_agent_string DiamondBot - -user_agent_string EmeraldShield.com \WebBot - -user_agent_string envolk[ITS]spider - -user_agent_string EsperanzaBot - -user_agent_string Exabot - -user_agent_string FAST Enterprise Crawler - -user_agent_string FAST-WebCrawler - -user_agent_string FDSE robot - -user_agent_string FindLinks - -user_agent_string FurlBot - -user_agent_string FyberSpider - -user_agent_string g2crawler - -user_agent_string Gaisbot - -user_agent_string genieBot - -user_agent_string Gigabot - -user_agent_string Girafabot - -user_agent_string Googlebot - -user_agent_string Googlebot-Image - -user_agent_string hl_ftien_spider - -user_agent_string htdig - -user_agent_string ia_archiver - -user_agent_string ichiro - -user_agent_string IRLbot - -user_agent_string IssueCrawler - 
-user_agent_string Java - -user_agent_string Jyxobot - -user_agent_string LapozzBot - -user_agent_string Larbin - -user_agent_string libwww-perl - -user_agent_string LinkWalker - -user_agent_string lmspider - -user_agent_string lwp-trivial - -user_agent_string mabontland - -user_agent_string Mediapartners-Google - -user_agent_string MJ12bot - -user_agent_string Mnogosearch - -user_agent_string mogimogi - -user_agent_string MojeekBot - -user_agent_string Morning Paper - -user_agent_string msnbot - -user_agent_string MSRBot - -user_agent_string MVAClient - -user_agent_string NetResearchServer - -user_agent_string NG-Search - -user_agent_string nicebot - -user_agent_string noxtrumbot - -user_agent_string Nusearch Spider - -user_agent_string NutchCVS - -user_agent_string obot - -user_agent_string oegp - -user_agent_string OmniExplorer_Bot - -user_agent_string Orbiter - -user_agent_string PageBitesHyperBot - -user_agent_string polybot - -user_agent_string Pompos - -user_agent_string Psbot - -user_agent_string PycURL - -user_agent_string Python-urllib - -user_agent_string RAMPyBot - -user_agent_string RufusBot - -user_agent_string SandCrawler - -user_agent_string SBIder - -user_agent_string Scrubby - -user_agent_string SearchSight - -user_agent_string Seekbot - -user_agent_string semanticdiscovery - -user_agent_string Sensis Web Crawler - -user_agent_string SEOChat::Bot - -user_agent_string Shim-Crawler - -user_agent_string ShopWiki - -user_agent_string Shoula robot - -user_agent_string silk - -user_agent_string Snappy - -user_agent_string sogou spider - -user_agent_string Speedy Spider - -user_agent_string Sqworm - -user_agent_string StackRambler - -user_agent_string SurveyBot - -user_agent_string SynooBot - -user_agent_string Teoma - -user_agent_string TerrawizBot - -user_agent_string TheSuBot - -user_agent_string Thumbnail.CZ robot - -user_agent_string TurnitinBot - -user_agent_string updated - -user_agent_string VoilaBot - -user_agent_string Vortex - 
-user_agent_string voyager - -user_agent_string VYU2 - -user_agent_string webcollage - -user_agent_string Websquash.com - -user_agent_string wf84 - -user_agent_string WoFindeIch Robot - -user_agent_string Xaldon_WebSpider - -user_agent_string yacy - -user_agent_string Yahoo! Slurp - -user_agent_string Yahoo! Slurp China - -user_agent_string YahooSeeker - -user_agent_string YahooSeeker-Testing - -user_agent_string yoogliFetchAgent - -user_agent_string Zao - -user_agent_string Zealbot - -user_agent_string zspider - -user_agent_string ZyBorg - -# OFFLINE BROWSERS - -user_agent_string Offline Explorer - -user_agent_string SuperBot - -user_agent_string Web Downloader - -user_agent_string WebCopier - -user_agent_string WebZIP - -# EmailSiphon - -user_agent_string EmailSiphon - diff --git a/doc/Makefile.am b/doc/Makefile.am index 7bf022fec..ffb6dfd11 100644 --- a/doc/Makefile.am +++ b/doc/Makefile.am @@ -54,7 +54,6 @@ man5_MANS = \ auth.conf.5 \ automatic_lists_description.conf.5 \ charset.conf.5 \ - crawlers_detection.conf.5 \ edit_list.conf.5 \ ldap_alias_manager.conf.5 \ list_config.5 \ @@ -71,7 +70,6 @@ EXTRA_DIST = $(nobase_doc_DATA) \ auth.conf.pod \ automatic_lists_description.conf.pod \ charset.conf.pod \ - crawlers_detection.conf.pod \ edit_list.conf.pod \ ldap_alias_manager.conf.pod \ list_config.pod \ diff --git a/doc/crawlers_detection.conf.pod b/doc/crawlers_detection.conf.pod deleted file mode 100644 index 7fe5a68b4..000000000 --- a/doc/crawlers_detection.conf.pod +++ /dev/null @@ -1,37 +0,0 @@ -=encoding utf-8 - -=head1 NAME - -crawlers_detection.conf - User agents to be excluded from session management - -=head1 DESCRIPTION - -F defines user agents to be excluded from session -management by Sympa web interface. - -TBD. - -=head1 FILES - -=over - -=item F<$DEFAULTDIR/crawlers_detection.conf> - -Distribution default. This file should not be edited. - -=item F<$SYSCONFDIR/Erobot nameE/crawlers_detection.conf> - -Configuration file for each robot. 
- -=back - -=head1 SEE ALSO - -L. - -=head1 HISTORY - -This document was initially written by IKEDA Soji . - -=cut - diff --git a/doc/sympa_config.podpl b/doc/sympa_config.podpl index 283148c74..9829fa660 100644 --- a/doc/sympa_config.podpl +++ b/doc/sympa_config.podpl @@ -578,7 +578,6 @@ L. L, L, -L, L, L, L, diff --git a/doc/sympa_toc.pod b/doc/sympa_toc.pod index 40a30cfa0..c1f662340 100644 --- a/doc/sympa_toc.pod +++ b/doc/sympa_toc.pod @@ -128,10 +128,6 @@ Configuration of authentication mechanisms for web interface of Sympa Configuration file for legacy character set support by Sympa -=item L - -User agents to be excluded from session management - =item L Configuration of privileges to edit list configuration diff --git a/src/cgi/wwsympa.fcgi.in b/src/cgi/wwsympa.fcgi.in index f7af680bd..b82868678 100644 --- a/src/cgi/wwsympa.fcgi.in +++ b/src/cgi/wwsympa.fcgi.in @@ -1746,27 +1746,21 @@ Sympa::Spool::Listmaster->instance->flush(purge => 1); ## Write to log sub wwslog { my $facility = shift; + my $msg = shift; - my $msg = shift; my $remote = $ENV{'REMOTE_HOST'} || $ENV{'REMOTE_ADDR'}; - my $wwsmsg = ''; - - $wwsmsg = "[list $param->{'list'}] " . $wwsmsg - if $param->{'list'}; - - $wwsmsg = "[user $param->{'user'}{'email'}] " . $wwsmsg - if $param->{'user'}{'email'}; - - $wwsmsg = "[rss] " . $wwsmsg - if $rss; - - $wwsmsg = "[client $remote] " . $wwsmsg - if $remote; - - $wwsmsg = "[session $session->{'id_session'}] " . $wwsmsg - if $session; - - $wwsmsg = "[robot $robot] " . 
$wwsmsg; + my $bot = $session ? $session->{'is_a_crawler'} : undef; + # Absent items yield undef and are dropped, so no stray '' or 0 is logged. + my $wwsmsg = join ' ', + grep {defined} ( + "[robot $robot]", + ($session ? "[session $session->{'id_session'}]" : undef), + ($remote ? "[client $remote]" : undef), + ($bot ? "[bot <$bot>]" : undef), + ($rss ? "[rss]" : undef), + ($param->{'user'}{'email'} ? "[user $param->{'user'}{'email'}]" : undef), + ($param->{'list'} ? "[list $param->{'list'}]" : undef) + ); push @_, $wwsmsg; if ($msg =~ /^([(][^)]*[)])\s*(.*)/s) { @@ -8391,13 +8385,12 @@ sub do_arc { $param->{'file'} = $archive->{arc_directory} . '/' . $in{'arc_file'}; } - $param->{'date'} = Sympa::Tools::File::get_mtime( - $archive->{arc_directory} . '/' . $in{'arc_file'}); - # send page as static if client is a bot. That's prevent crawling all - # archices every weeks by google, yahoo and others bots - if ($session->{'is_a_crawler'}) { - $param->{'header_date'} = $param->{'date'}; - } + # Send page as static if client is a bot. That prevents crawling all + # archives every week by Google, Yahoo and other bots. + $param->{'header_date'} = + Sympa::Tools::File::get_mtime( + $archive->{arc_directory} . '/' . 
$in{'arc_file'}) + if $session->{'is_a_crawler'}; $param->{'archive_name'} = $in{'month'}; #test pour différentier les action d'un robot et d'un simple abonné diff --git a/src/lib/Conf.pm b/src/lib/Conf.pm index 338b7cd31..4a2a2d5fa 100644 --- a/src/lib/Conf.pm +++ b/src/lib/Conf.pm @@ -1151,35 +1151,9 @@ sub load_trusted_application { return load_generic_conf_file($config_file, \%trusted_applications); } -## load trusted_application.conf configuration file -sub load_crawlers_detection { - my $that = shift || '*'; - - my %crawlers_detection_conf = ( - 'user_agent_string' => { - 'occurrence' => '0-n', - 'format' => '.+' - } - ); - - my $config_file = - Sympa::search_fullpath($that, 'crawlers_detection.conf'); - return undef unless $config_file and -r $config_file; - my $hashtab = - load_generic_conf_file($config_file, \%crawlers_detection_conf); - my $hashhash; - - foreach my $kword (keys %{$hashtab}) { - # ignore comments and default - next - unless ($crawlers_detection_conf{$kword}); - foreach my $value (@{$hashtab->{$kword}}) { - $hashhash->{$kword}{$value} = 'true'; - } - } - - return $hashhash; -} +# load crawlers_detection.conf configuration file +# Deprecated. 
+#sub load_crawlers_detection; ############################################################ # load_generic_conf_file @@ -1645,8 +1619,6 @@ sub _load_server_specific_secondary_config_files { ## Load nrcpt_by_domain.conf $param->{'config_hash'}{'nrcpt_by_domain'} = load_nrcpt_by_domain(); - $param->{'config_hash'}{'crawlers_detection'} = - load_crawlers_detection($param->{'config_hash'}{'robot_name'}); } sub _infer_robot_parameter_values { diff --git a/src/lib/Makefile.am b/src/lib/Makefile.am index 3147e7cff..06ecfbffc 100644 --- a/src/lib/Makefile.am +++ b/src/lib/Makefile.am @@ -236,6 +236,7 @@ nonCLI_modules = \ Sympa/Upgrade.pm \ Sympa/User.pm \ Sympa/WWW/Auth.pm \ + Sympa/WWW/Crawlers.pm \ Sympa/WWW/FastCGI.pm \ Sympa/WWW/Marc.pm \ Sympa/WWW/Marc/Search.pm \ diff --git a/src/lib/Sympa/WWW/Crawlers.pm b/src/lib/Sympa/WWW/Crawlers.pm new file mode 100644 index 000000000..493fc9202 --- /dev/null +++ b/src/lib/Sympa/WWW/Crawlers.pm @@ -0,0 +1,630 @@ +# -*- indent-tabs-mode: nil; -*- +# vim:ft=perl:et:sw=4 + +# Sympa - SYsteme de Multi-Postage Automatique + +# NOTE: This file is auto-generated. Don't edit it manually. +# Instead, modifications should be made on support/make_crawlers.pl file. 
+ +package Sympa::WWW::Crawlers; + +use strict; +use warnings; + +use constant crawler => qr{ + ( + Googlebot\/ + | Googlebot-Mobile + | Googlebot-Image + | Googlebot-News + | Googlebot-Video + | AdsBot-Google([^-]|$) + | AdsBot-Google-Mobile + | Feedfetcher-Google + | Mediapartners-Google + | Mediapartners[ ]\(Googlebot\) + | APIs-Google + | Google-InspectionTool + | Storebot-Google + | GoogleOther + | bingbot + | Slurp + | [wW]get + | LinkedInBot + | Python-urllib + | python-requests + | aiohttp + | httpx + | libwww-perl + | httpunit + | Nutch + | Go-http-client + | phpcrawl + | msnbot + | jyxobot + | FAST-WebCrawler + | FAST[ ]Enterprise[ ]Crawler + | BIGLOTRON + | Teoma + | convera + | seekbot + | Gigabot + | Gigablast + | exabot + | ia_archiver + | GingerCrawler + | webmon[ ] + | HTTrack + | grub\.org + | UsineNouvelleCrawler + | antibot + | netresearchserver + | speedy + | fluffy + | findlink + | msrbot + | panscient + | yacybot + | AISearchBot + | ips-agent + | tagoobot + | MJ12bot + | woriobot + | yanga + | buzzbot + | mlbot + | yandex\.com\/bots + | purebot + | Linguee[ ]Bot + | CyberPatrol + | voilabot + | Baiduspider + | citeseerxbot + | spbot + | twengabot + | postrank + | Turnitin + | scribdbot + | page2rss + | sitebot + | linkdex + | Adidxbot + | ezooms + | dotbot + | Mail\.RU_Bot + | discobot + | heritrix + | findthatfile + | europarchive\.org + | NerdByNature\.Bot + | sistrix[ ]crawler + | Ahrefs(Bot|SiteAudit) + | fuelbot + | CrunchBot + | IndeedBot + | mappydata + | woobot + | ZoominfoBot + | PrivacyAwareBot + | Multiviewbot + | SWIMGBot + | Grobbot + | eright + | Apercite + | semanticbot + | Aboundex + | domaincrawler + | wbsearchbot + | summify + | CCBot + | edisterbot + | SeznamBot + | ec2linkfinder + | gslfbot + | aiHitBot + | intelium_bot + | facebookexternalhit + | Yeti + | RetrevoPageAnalyzer + | lb-spider + | Sogou + | lssbot + | careerbot + | wotbox + | wocbot + | ichiro + | DuckDuckBot + | lssrocketcrawler + | drupact + | 
webcompanycrawler + | acoonbot + | openindexspider + | gnam[ ]gnam[ ]spider + | web-archive-net\.com\.bot + | backlinkcrawler + | coccoc + | integromedb + | content[ ]crawler[ ]spider + | toplistbot + | it2media-domain-crawler + | ip-web-crawler\.com + | siteexplorer\.info + | elisabot + | proximic + | changedetection + | arabot + | WeSEE:Search + | niki-bot + | CrystalSemanticsBot + | rogerbot + | 360Spider + | psbot + | InterfaxScanBot + | CC[ ]Metadata[ ]Scaper + | g00g1e\.net + | GrapeshotCrawler + | urlappendbot + | brainobot + | fr-crawler + | binlar + | SimpleCrawler + | Twitterbot + | cXensebot + | smtbot + | bnf\.fr_bot + | A6-Indexer + | ADmantX + | Facebot + | OrangeBot\/ + | memorybot + | AdvBot + | MegaIndex + | SemanticScholarBot + | ltx71 + | nerdybot + | xovibot + | BUbiNG + | Qwantify + | archive\.org_bot + | Applebot + | TweetmemeBot + | crawler4j + | findxbot + | S[eE][mM]rushBot + | yoozBot + | lipperhey + | Y!J + | Domain[ ]Re-Animator[ ]Bot + | AddThis + | Screaming[ ]Frog[ ]SEO[ ]Spider + | MetaURI + | Scrapy + | Livelap[bB]ot + | OpenHoseBot + | CapsuleChecker + | collection\@infegy\.com + | IstellaBot + | DeuSu\/ + | betaBot + | Cliqzbot\/ + | MojeekBot\/ + | netEstate[ ]NE[ ]Crawler + | SafeSearch[ ]microdata[ ]crawler + | Gluten[ ]Free[ ]Crawler\/ + | Sonic + | Sysomos + | Trove + | deadlinkchecker + | Slack-ImgProxy + | Embedly + | RankActiveLinkBot + | iskanie + | SafeDNSBot + | SkypeUriPreview + | Veoozbot + | Slackbot + | redditbot + | datagnionbot + | Google-Adwords-Instant + | adbeat_bot + | WhatsApp + | contxbot + | pinterest\.com\/bot + | electricmonk + | GarlikCrawler + | BingPreview\/ + | vebidoobot + | FemtosearchBot + | Yahoo[ ]Link[ ]Preview + | MetaJobBot + | DomainStatsBot + | mindUpBot + | Daum\/ + | Jugendschutzprogramm-Crawler + | Xenu[ ]Link[ ]Sleuth + | Pcore-HTTP + | moatbot + | KosmioBot + | [pP]ingdom + | AppInsights + | PhantomJS + | Gowikibot + | PiplBot + | Discordbot + | TelegramBot + | Jetslide + | 
newsharecounts + | James[ ]BOT + | Bark[rR]owler + | TinEye + | SocialRankIOBot + | trendictionbot + | Ocarinabot + | epicbot + | Primalbot + | DuckDuckGo-Favicons-Bot + | GnowitNewsbot + | Leikibot + | LinkArchiver + | YaK\/ + | PaperLiBot + | Digg[ ]Deeper + | dcrawl + | Snacktory + | AndersPinkBot + | Fyrebot + | EveryoneSocialBot + | Mediatoolkitbot + | Luminator-robots + | ExtLinksBot + | SurveyBot + | NING\/ + | okhttp + | Nuzzel + | omgili + | PocketParser + | YisouSpider + | um-LN + | ToutiaoSpider + | MuckRack + | Jamie's[ ]Spider + | AHC\/ + | NetcraftSurveyAgent + | Laserlikebot + | ^Apache-HttpClient + | AppEngine-Google + | Jetty + | Upflow + | Thinklab + | Traackr\.com + | Twurly + | Mastodon + | http_get + | DnyzBot + | botify + | 007ac9[ ]Crawler + | BehloolBot + | BrandVerity + | check_http + | BDCbot + | ZumBot + | EZID + | ICC-Crawler + | ArchiveBot + | ^LCC[ ] + | filterdb\.iss\.net\/crawler + | BLP_bbot + | BomboraBot + | Buck\/ + | Companybook-Crawler + | Genieo + | magpie-crawler + | MeltwaterNews + | Moreover + | newspaper\/ + | ScoutJet + | (^|[ ])sentry\/ + | StorygizeBot + | UptimeRobot + | OutclicksBot + | seoscanners + | Hatena + | Google[ ]Web[ ]Preview + | MauiBot + | AlphaBot + | SBL-BOT + | IAS[ ]crawler + | adscanner + | Netvibes + | acapbot + | Baidu-YunGuanCe + | bitlybot + | blogmuraBot + | Bot\.AraTurka\.com + | bot-pge\.chlooe\.com + | BoxcarBot + | BTWebClient + | ContextAd[ ]Bot + | Digincore[ ]bot + | Disqus + | Feedly + | Fetch\/ + | Fever + | Flamingo_SearchEngine + | FlipboardProxy + | g2reader-bot + | G2[ ]Web[ ]Services + | imrbot + | K7MLWCBot + | Kemvibot + | Landau-Media-Spider + | linkapediabot + | vkShare + | Siteimprove\.com + | BLEXBot\/ + | DareBoost + | ZuperlistBot\/ + | Miniflux\/ + | Feedspot + | Diffbot\/ + | SEOkicks + | tracemyfile + | Nimbostratus-Bot + | zgrab + | PR-CY\.RU + | AdsTxtCrawler + | Datafeedwatch + | Zabbix + | TangibleeBot + | google-xrawler + | axios + | Amazon[ ]CloudFront + | 
Pulsepoint + | CloudFlare-AlwaysOnline + | Google-Structured-Data-Testing-Tool + | WordupInfoSearch + | WebDataStats + | HttpUrlConnection + | Seekport[ ]Crawler + | ZoomBot + | VelenPublicWebCrawler + | MoodleBot + | jpg-newsbot + | outbrain + | W3C_Validator + | Validator\.nu + | W3C-checklink + | W3C-mobileOK + | W3C_I18n-Checker + | FeedValidator + | W3C_CSS_Validator + | W3C_Unicorn + | Google-PhysicalWeb + | Blackboard + | ICBot\/ + | BazQux + | Twingly + | Rivva + | Experibot + | awesomecrawler + | Dataprovider\.com + | GroupHigh\/ + | theoldreader\.com + | AnyEvent + | Uptimebot\.org + | Nmap[ ]Scripting[ ]Engine + | 2ip\.ru + | Clickagy + | Caliperbot + | MBCrawler + | online-webceo-bot + | B2B[ ]Bot + | AddSearchBot + | Google[ ]Favicon + | HubSpot + | Chrome-Lighthouse + | HeadlessChrome + | CheckMarkNetwork\/ + | www\.uptime\.com + | Streamline3Bot\/ + | serpstatbot\/ + | MixnodeCache\/ + | ^curl + | SimpleScraper + | RSSingBot + | Jooblebot + | fedoraplanet + | Friendica + | NextCloud + | Tiny[ ]Tiny[ ]RSS + | RegionStuttgartBot + | Bytespider + | Datanyze + | Google-Site-Verification + | TrendsmapResolver + | tweetedtimes + | NTENTbot + | Gwene + | SimplePie + | SearchAtlas + | Superfeedr + | feedbot + | UT-Dorkbot + | Amazonbot + | SerendeputyBot + | Eyeotabot + | officestorebot + | Neticle[ ]Crawler + | SurdotlyBot + | LinkisBot + | AwarioSmartBot + | AwarioRssBot + | RyteBot + | FreeWebMonitoring[ ]SiteChecker + | AspiegelBot + | NAVER[ ]Blog[ ]Rssbot + | zenback[ ]bot + | SentiBot + | Domains[ ]Project\/ + | Pandalytics + | VKRobot + | bidswitchbot + | tigerbot + | NIXStatsbot + | Atom[ ]Feed[ ]Robot + | [Cc]urebot + | PagePeeker\/ + | Vigil\/ + | rssbot\/ + | startmebot\/ + | JobboerseBot + | seewithkids + | NINJA[ ]bot + | Cutbot + | BublupBot + | BrandONbot + | RidderBot + | Taboolabot + | Dubbotbot + | FindITAnswersbot + | infoobot + | Refindbot + | BlogTraffic\/\d\.\d+[ ]Feed-Fetcher + | SeobilityBot + | Cincraw + | Dragonbot + | 
VoluumDSP-content-bot + | FreshRSS + | BitBot + | ^PHP-Curl-Class + | Google-Certificates-Bridge + | centurybot + | Viber + | e\.ventures[ ]Investment[ ]Crawler + | evc-batch + | PetalBot + | virustotal + | (^|[ ])PTST\/ + | minicrawler + | Cookiebot + | trovitBot + | seostar\.co + | IonCrawl + | Uptime-Kuma + | SeekportBot + | FreshpingBot + | Feedbin + | CriteoBot + | Snap[ ]URL[ ]Preview[ ]Service + | Better[ ]Uptime[ ]Bot + | RuxitSynthetic + | Google-Read-Aloud + | Valve\/Steam + | OdklBot\/ + | GPTBot + | ChatGPT-User + | YandexRenderResourcesBot\/ + | LightspeedSystemsCrawler + | ev-crawler\/ + | BitSightBot\/ + | woorankreview\/ + | Google-Safety + | AwarioBot + | DataForSeoBot + | Linespider + | WellKnownBot + | A[ ]Patent[ ]Crawler + | StractBot + | search\.marginalia\.nu + | YouBot + | Nicecrawler + | Neevabot + | BrightEdge[ ]Crawler + | SiteCheckerBotCrawler + | TombaPublicWebCrawler + | CrawlyProjectCrawler + | KomodiaBot + | KStandBot + | CISPA[ ]Webcrawler + | MTRobot + | hyscore\.io + | AlexandriaOrgBot + | 2ip[ ]bot + | Yellowbrandprotectionbot + | SEOlizer + | vuhuvBot + | INETDEX-BOT + | Synapse + | t3versionsBot + | deepnoc + | Cocolyzebot + | hypestat + | ReverseEngineeringBot + | sempi\.tech + | Iframely + | MetaInspector + | node-fetch + | lkxscan + | python-opengraph + | OpenGraphCheck + | developers\.google\.com\/\+\/web\/snippet + | SenutoBot + | MaCoCu + | NewsBlur + | inoreader + | NetSystemsResearch + | PageThing + | WordPress\/ + | PhxBot + | ImagesiftBot + | Expanse + | InternetMeasurement + | ^BW\/ + | GeedoBot + | Audisto[ ]Crawler + | PerplexityBot\/ + | [cC]laude[bB]ot + | Monsidobot + | GroupMeBot + ) +}x; + +1; + +__END__ +=encoding utf-8 + +=head1 NAME + +Sympa::WWW::Crawlers - Regular expression for User-Agent of web crawlers + +=head1 DESCRIPTION + +This module keeps definition of regular expressions used by Sympa software. + +The regular expression is generated from the data provided by the +project below. 
+ +=head1 SEE ALSO + +=over + +=item * + +Syntactic patterns of HTTP user-agents used by bots / robots / crawlers / +scrapers / spiders + +L<https://github.com/monperrus/crawler-user-agents> + +=back + + +=head1 HISTORY + +Crawler detection feature of WWSympa was introduced in Sympa 5.4a.4, +deriving its information from L<http://www.useragentstring.com/>. + +In Sympa 6.2.74, it was replaced with regular expression matching +using information provided by the crawler-user-agents project above. + +=cut diff --git a/src/lib/Sympa/WWW/Session.pm b/src/lib/Sympa/WWW/Session.pm index 797768427..9bca282fd 100644 --- a/src/lib/Sympa/WWW/Session.pm +++ b/src/lib/Sympa/WWW/Session.pm @@ -38,6 +38,7 @@ use Sympa::Language; use Sympa::Log; use Sympa::Tools::Data; use Sympa::Tools::Password; +use Sympa::WWW::Crawlers; # this structure is used to define which session attributes are stored in a # dedicated database col where others are compiled in col 'data_session' @@ -78,12 +79,10 @@ sub new { # passive_session are session not stored in the database, they are used # for crawler bots and action such as css, wsdl, ajax and rss - if (_is_a_crawler($robot)) { - $self->{'is_a_crawler'} = 1; - $self->{'passive_session'} = 1; - } + $self->{'is_a_crawler'} = _is_a_crawler($robot); $self->{'passive_session'} = 1 - if $rss + if $self->{'is_a_crawler'} + or $rss or $action and ($action eq 'wsdl' or $action eq 'css'); # if a session cookie exist, try to restore an existing session, don't @@ -640,16 +639,17 @@ sub _generic_get_cookie { # DEPRECATED: No longer used. #sub check_cookie_extern; -# input user agent string and IP. return 1 if suspected to be a crawler. -# initial version based on rawlers_dtection.conf file only -# later : use Session table to identify those who create a lot of sessions +# Returns the matched User-Agent token if the client looks like a crawler. #FIXME: Robot context is ignored. 
+my $crawler_re = Sympa::WWW::Crawlers::crawler(); + sub _is_a_crawler { my $robot = shift; my $ua = $ENV{'HTTP_USER_AGENT'}; return undef unless defined $ua; - return $Conf::Conf{'crawlers_detection'}{'user_agent_string'}{$ua}; + return undef unless $ua =~ $crawler_re; + return $1 || '?'; } sub confirm_action { diff --git a/support/README.support.md b/support/README.support.md index fe35d29cd..10b934516 100644 --- a/support/README.support.md +++ b/support/README.support.md @@ -15,6 +15,13 @@ Initially taken from repository of rsync https://git.samba.org/?p=rsync.git;a=history;f=support/git-set-file-times at 2009-01-13, and made modifications. +### make_crawlers.pl + +Generates `Sympa/WWW/Crawlers.pm` file, by running as: +``` +make_crawlers.pl -o $MODULEDIR/Sympa/WWW/Crawlers.pm +``` + ### pod2md Converts POD data to Markdown format. This may be used as a replacement of diff --git a/support/make_crawlers.pl b/support/make_crawlers.pl new file mode 100755 index 000000000..b20395600 --- /dev/null +++ b/support/make_crawlers.pl @@ -0,0 +1,108 @@ +#!/usr/bin/env perl +# -*- indent-tabs-mode: nil; -*- +# vim:ft=perl:et:sw=4 + +use strict; +use warnings; +use English qw(-no_match_vars); +use Getopt::Long; +use JSON qw(); +use LWP::Simple qw(); + +use constant crawlers_url => + 'https://raw.githubusercontent.com/monperrus/crawler-user-agents/master/crawler-user-agents.json'; + +my %opts; +GetOptions(\%opts, 'output|o=s') or exit 1; + +my $crawlers = JSON->new->decode(LWP::Simple::get(crawlers_url()) // '[]'); +die "No content.\n" unless ref $crawlers eq 'ARRAY' and @$crawlers; + +my @patterns = map { + if (ref $_ eq 'HASH' and defined $_->{pattern}) { + ($_->{pattern} =~ s/([ #{}])/[$1]/gr =~ s/\@/\\\@/gr =~ + s/(? 
}, join "\n | ", @patterns; +eval $output;    # NOTE(review): string eval of text built from fetched data +$EVAL_ERROR and die "$EVAL_ERROR\n"; + +my $fh; +if ($opts{output}) { + if ($opts{output} eq '-') { + $fh = *STDOUT; + } else { + open $fh, '>', $opts{output} or die "$ERRNO\n"; + } +} else { + require File::Basename; + my $dir = File::Basename::dirname($0); + open $fh, '>', "$dir/../src/lib/Sympa/WWW/Crawlers.pm" + or die "$ERRNO\n"; +} +print $fh $output; +close $fh or die "$ERRNO\n"; +__END__ +# -*- indent-tabs-mode: nil; -*- +# vim:ft=perl:et:sw=4 + +# Sympa - SYsteme de Multi-Postage Automatique + +# NOTE: This file is auto-generated. Don't edit it manually. +# Instead, modifications should be made on support/make_crawlers.pl file. + +package Sympa::WWW::Crawlers; + +use strict; +use warnings; + +use constant crawler => qr{ + ( + %s + ) +}x; + +1; + +__END__ +=encoding utf-8 + +=head1 NAME + +Sympa::WWW::Crawlers - Regular expression for User-Agent of web crawlers + +=head1 DESCRIPTION + +This module keeps definition of regular expressions used by Sympa software. + +The regular expression is generated from the data provided by the +project below. + +=head1 SEE ALSO + +=over + +=item * + +Syntactic patterns of HTTP user-agents used by bots / robots / crawlers / +scrapers / spiders + +L<https://github.com/monperrus/crawler-user-agents> + +=back + + +=head1 HISTORY + +Crawler detection feature of WWSympa was introduced in Sympa 5.4a.4, +deriving its information from L<http://www.useragentstring.com/>. + +In Sympa 6.2.74, it was replaced with regular expression matching +using information provided by the crawler-user-agents project above. + +=cut