From 9db8c0c646eef72de9b42694e7de88eba6e975f4 Mon Sep 17 00:00:00 2001 From: IKEDA Soji Date: Wed, 29 Nov 2023 16:21:45 +0900 Subject: [PATCH 1/3] Crawler detection is replaced with regular expression matching using new information source: https://github.com/monperrus/crawler-user-agents --- default/Makefile.am | 1 - default/crawlers_detection.conf | 260 -------------------------------- doc/Makefile.am | 2 - doc/crawlers_detection.conf.pod | 37 ----- doc/sympa_config.podpl | 1 - doc/sympa_toc.pod | 4 - src/lib/Conf.pm | 34 +---- src/lib/Makefile.am | 1 + src/lib/Sympa/WWW/Session.pm | 18 +-- support/README.support.md | 7 + support/make_crawlers.pl | 108 +++++++++++++ 11 files changed, 128 insertions(+), 345 deletions(-) delete mode 100644 default/crawlers_detection.conf delete mode 100644 doc/crawlers_detection.conf.pod create mode 100755 support/make_crawlers.pl diff --git a/default/Makefile.am b/default/Makefile.am index e1367bc20..c5da05e3b 100644 --- a/default/Makefile.am +++ b/default/Makefile.am @@ -28,7 +28,6 @@ nobase_nodist_default_DATA = \ nobase_default_DATA = \ auth.conf \ charset.conf \ - crawlers_detection.conf \ create_list.conf \ edit_list.conf \ ldap_alias_entry.tt2 \ diff --git a/default/crawlers_detection.conf b/default/crawlers_detection.conf deleted file mode 100644 index 753ed4ab5..000000000 --- a/default/crawlers_detection.conf +++ /dev/null @@ -1,260 +0,0 @@ -# Robots user agent string. Source http://www.useragentstring.com/pages/useragentstring.php (09/2007) - -user_agent_string ABACHOBot - -user_agent_string AbiLogicBot - -user_agent_string Accoona-AI-Agent - -user_agent_string AnyApexBot - -user_agent_string Arachmo - -user_agent_string B-l-i-t-z-B-O-T - -user_agent_string Baiduspider - -user_agent_string BecomeBot - -user_agent_string Bimbot - -user_agent_string BlitzBOT - -user_agent_string boitho.com-dc - -user_agent_string boitho.com-robot - -user_agent_string btbot - -user_agent_string Cerberian Drtrs - -user_agent_string ConveraCrawler - -user_agent_string cosmos - -user_agent_string DataparkSearch - -user_agent_string DiamondBot - -user_agent_string EmeraldShield.com \WebBot - -user_agent_string envolk[ITS]spider - -user_agent_string EsperanzaBot - -user_agent_string Exabot - -user_agent_string FAST Enterprise Crawler - -user_agent_string FAST-WebCrawler - -user_agent_string FDSE robot - -user_agent_string FindLinks - -user_agent_string FurlBot - -user_agent_string FyberSpider - -user_agent_string g2crawler - -user_agent_string Gaisbot - -user_agent_string genieBot - -user_agent_string Gigabot - -user_agent_string Girafabot - -user_agent_string Googlebot - -user_agent_string Googlebot-Image - -user_agent_string hl_ftien_spider - -user_agent_string htdig - -user_agent_string ia_archiver - -user_agent_string ichiro - -user_agent_string IRLbot - -user_agent_string IssueCrawler - -user_agent_string Java - -user_agent_string Jyxobot - -user_agent_string LapozzBot - -user_agent_string Larbin - -user_agent_string libwww-perl - -user_agent_string LinkWalker - -user_agent_string lmspider - -user_agent_string lwp-trivial - -user_agent_string mabontland - -user_agent_string Mediapartners-Google - -user_agent_string MJ12bot - -user_agent_string Mnogosearch - -user_agent_string mogimogi - -user_agent_string MojeekBot - -user_agent_string Morning Paper - -user_agent_string msnbot - -user_agent_string MSRBot - -user_agent_string MVAClient - -user_agent_string NetResearchServer - -user_agent_string NG-Search - -user_agent_string nicebot - -user_agent_string noxtrumbot - 
-user_agent_string Nusearch Spider - -user_agent_string NutchCVS - -user_agent_string obot - -user_agent_string oegp - -user_agent_string OmniExplorer_Bot - -user_agent_string Orbiter - -user_agent_string PageBitesHyperBot - -user_agent_string polybot - -user_agent_string Pompos - -user_agent_string Psbot - -user_agent_string PycURL - -user_agent_string Python-urllib - -user_agent_string RAMPyBot - -user_agent_string RufusBot - -user_agent_string SandCrawler - -user_agent_string SBIder - -user_agent_string Scrubby - -user_agent_string SearchSight - -user_agent_string Seekbot - -user_agent_string semanticdiscovery - -user_agent_string Sensis Web Crawler - -user_agent_string SEOChat::Bot - -user_agent_string Shim-Crawler - -user_agent_string ShopWiki - -user_agent_string Shoula robot - -user_agent_string silk - -user_agent_string Snappy - -user_agent_string sogou spider - -user_agent_string Speedy Spider - -user_agent_string Sqworm - -user_agent_string StackRambler - -user_agent_string SurveyBot - -user_agent_string SynooBot - -user_agent_string Teoma - -user_agent_string TerrawizBot - -user_agent_string TheSuBot - -user_agent_string Thumbnail.CZ robot - -user_agent_string TurnitinBot - -user_agent_string updated - -user_agent_string VoilaBot - -user_agent_string Vortex - -user_agent_string voyager - -user_agent_string VYU2 - -user_agent_string webcollage - -user_agent_string Websquash.com - -user_agent_string wf84 - -user_agent_string WoFindeIch Robot - -user_agent_string Xaldon_WebSpider - -user_agent_string yacy - -user_agent_string Yahoo! Slurp - -user_agent_string Yahoo! Slurp China - -user_agent_string YahooSeeker - -user_agent_string YahooSeeker-Testing - -user_agent_string yoogliFetchAgent - -user_agent_string Zao - -user_agent_string Zealbot - -user_agent_string zspider - -user_agent_string ZyBorg - -# OFFLINE BROWSERS - -user_agent_string Offline Explorer - -user_agent_string SuperBot - -user_agent_string Web Downloader - -user_agent_string WebCopier - -user_agent_string WebZIP - -# EmailSiphon - -user_agent_string EmailSiphon - diff --git a/doc/Makefile.am b/doc/Makefile.am index 7bf022fec..ffb6dfd11 100644 --- a/doc/Makefile.am +++ b/doc/Makefile.am @@ -54,7 +54,6 @@ man5_MANS = \ auth.conf.5 \ automatic_lists_description.conf.5 \ charset.conf.5 \ - crawlers_detection.conf.5 \ edit_list.conf.5 \ ldap_alias_manager.conf.5 \ list_config.5 \ @@ -71,7 +70,6 @@ EXTRA_DIST = $(nobase_doc_DATA) \ auth.conf.pod \ automatic_lists_description.conf.pod \ charset.conf.pod \ - crawlers_detection.conf.pod \ edit_list.conf.pod \ ldap_alias_manager.conf.pod \ list_config.pod \ diff --git a/doc/crawlers_detection.conf.pod b/doc/crawlers_detection.conf.pod deleted file mode 100644 index 7fe5a68b4..000000000 --- a/doc/crawlers_detection.conf.pod +++ /dev/null @@ -1,37 +0,0 @@ -=encoding utf-8 - -=head1 NAME - -crawlers_detection.conf - User agents to be excluded from session management - -=head1 DESCRIPTION - -F defines user agents to be excluded from session -management by Sympa web interface. - -TBD. - -=head1 FILES - -=over - -=item F<$DEFAULTDIR/crawlers_detection.conf> - -Distribution default. This file should not be edited. - -=item F<$SYSCONFDIR/Erobot nameE/crawlers_detection.conf> - -Configuration file for each robot. - -=back - -=head1 SEE ALSO - -L. - -=head1 HISTORY - -This document was initially written by IKEDA Soji . 
- -=cut - diff --git a/doc/sympa_config.podpl b/doc/sympa_config.podpl index cfa7edcdd..507d1394f 100644 --- a/doc/sympa_config.podpl +++ b/doc/sympa_config.podpl @@ -575,7 +575,6 @@ L. L, L, -L, L, L, L, diff --git a/doc/sympa_toc.pod b/doc/sympa_toc.pod index 40a30cfa0..c1f662340 100644 --- a/doc/sympa_toc.pod +++ b/doc/sympa_toc.pod @@ -128,10 +128,6 @@ Configuration of authentication mechanisms for web interface of Sympa Configuration file for legacy character set support by Sympa -=item L - -User agents to be excluded from session management - =item L Configuration of privileges to edit list configuration diff --git a/src/lib/Conf.pm b/src/lib/Conf.pm index db5cbdf71..f2cb309b9 100644 --- a/src/lib/Conf.pm +++ b/src/lib/Conf.pm @@ -1149,35 +1149,9 @@ sub load_trusted_application { return load_generic_conf_file($config_file, \%trusted_applications); } -## load trusted_application.conf configuration file -sub load_crawlers_detection { - my $that = shift || '*'; - - my %crawlers_detection_conf = ( - 'user_agent_string' => { - 'occurrence' => '0-n', - 'format' => '.+' - } - ); - - my $config_file = - Sympa::search_fullpath($that, 'crawlers_detection.conf'); - return undef unless $config_file and -r $config_file; - my $hashtab = - load_generic_conf_file($config_file, \%crawlers_detection_conf); - my $hashhash; - - foreach my $kword (keys %{$hashtab}) { - # ignore comments and default - next - unless ($crawlers_detection_conf{$kword}); - foreach my $value (@{$hashtab->{$kword}}) { - $hashhash->{$kword}{$value} = 'true'; - } - } - - return $hashhash; -} +# load crawlers_detection.conf configuration file +# Deprecated. +#sub load_crawlers_detection; ############################################################ # load_generic_conf_file @@ -1657,8 +1631,6 @@ sub _load_server_specific_secondary_config_files { ## Load nrcpt_by_domain.conf $param->{'config_hash'}{'nrcpt_by_domain'} = load_nrcpt_by_domain(); - $param->{'config_hash'}{'crawlers_detection'} = - load_crawlers_detection($param->{'config_hash'}{'robot_name'}); } sub _infer_robot_parameter_values { diff --git a/src/lib/Makefile.am b/src/lib/Makefile.am index d79c6f288..5ec68b0c8 100644 --- a/src/lib/Makefile.am +++ b/src/lib/Makefile.am @@ -236,6 +236,7 @@ nonCLI_modules = \ Sympa/Upgrade.pm \ Sympa/User.pm \ Sympa/WWW/Auth.pm \ + Sympa/WWW/Crawlers.pm \ Sympa/WWW/FastCGI.pm \ Sympa/WWW/Marc.pm \ Sympa/WWW/Marc/Search.pm \ diff --git a/src/lib/Sympa/WWW/Session.pm b/src/lib/Sympa/WWW/Session.pm index 797768427..9bca282fd 100644 --- a/src/lib/Sympa/WWW/Session.pm +++ b/src/lib/Sympa/WWW/Session.pm @@ -38,6 +38,7 @@ use Sympa::Language; use Sympa::Log; use Sympa::Tools::Data; use Sympa::Tools::Password; +use Sympa::WWW::Crawlers; # this structure is used to define which session attributes are stored in a # dedicated database col where others are compiled in col 'data_session' @@ -78,12 +79,10 @@ sub new { # passive_session are session not stored in the database, they are used # for crawler bots and action such as css, wsdl, ajax and rss - if (_is_a_crawler($robot)) { - $self->{'is_a_crawler'} = 1; - $self->{'passive_session'} = 1; - } + $self->{'is_a_crawler'} = _is_a_crawler($robot); $self->{'passive_session'} = 1 - if $rss + if $self->{'is_a_crawler'} + or $rss or $action and ($action eq 'wsdl' or $action eq 'css'); # if a session cookie exist, try to restore an existing session, don't @@ -640,16 +639,17 @@ sub _generic_get_cookie { # DEPRECATED: No longer used. #sub check_cookie_extern; -# input user agent string and IP. 
return 1 if suspected to be a crawler. -# initial version based on rawlers_dtection.conf file only -# later : use Session table to identify those who create a lot of sessions +# input user agent string. return 1 if suspected to be a crawler. #FIXME: Robot context is ignored. +my $crawler_re = Sympa::WWW::Crawlers::crawler(); + sub _is_a_crawler { my $robot = shift; my $ua = $ENV{'HTTP_USER_AGENT'}; return undef unless defined $ua; - return $Conf::Conf{'crawlers_detection'}{'user_agent_string'}{$ua}; + return undef unless $ua =~ $crawler_re; + return $1 || '?'; } sub confirm_action { diff --git a/support/README.support.md b/support/README.support.md index fe35d29cd..10b934516 100644 --- a/support/README.support.md +++ b/support/README.support.md @@ -15,6 +15,13 @@ Initially taken from repository of rsync https://git.samba.org/?p=rsync.git;a=history;f=support/git-set-file-times at 2009-01-13, and made modifications. +### make_crawlers.pl + +Generates `Sympa/WWW/Crawlers.pm` file, by running as: +``` +make_crawlers.pl -o $MODULEDIR/Sympa/WWW/Crawlers.pm +``` + ### pod2md Converts POD data to Markdown format. This may be used as a replacement of diff --git a/support/make_crawlers.pl b/support/make_crawlers.pl new file mode 100755 index 000000000..b20395600 --- /dev/null +++ b/support/make_crawlers.pl @@ -0,0 +1,108 @@ +#!/usr/bin/env perl +# -*- indent-tabs-mode: nil; -*- +# vim:ft=perl:et:sw=4 + +use strict; +use warnings; +use English qw(-no_match_vars); +use Getopt::Long; +use JSON qw(); +use LWP::Simple qw(); + +use constant crawlers_url => + 'https://raw.githubusercontent.com/monperrus/crawler-user-agents/master/crawler-user-agents.json'; + +my %opts; +GetOptions(\%opts, 'output|o=s') or exit 1; + +my $crawlers = JSON->new->decode(LWP::Simple::get(crawlers_url())); +die "No content.\n" unless ref $crawlers eq 'ARRAY'; + +my @patterns = map { + if (ref $_ eq 'HASH' and defined $_->{pattern}) { + ($_->{pattern} =~ s/([ #{}])/[$1]/gr =~ s/\@/\\\@/gr =~ + s/(? }, join "\n | ", @patterns; +eval $output; +$EVAL_ERROR and die "$EVAL_ERROR\n"; + +my $fh; +if ($opts{output}) { + if ($opts{output} eq '-') { + $fh = *STDOUT; + } else { + open $fh, '>', $opts{output} or die "$ERRNO\n"; + } +} else { + my $dir = `dirname $0`; + chomp $dir; + open $fh, '>', "$dir/../src/lib/Sympa/WWW/Crawlers.pm" + or die "$ERRNO\n"; +} +print $fh $output; + +__END__ +# -*- indent-tabs-mode: nil; -*- +# vim:ft=perl:et:sw=4 + +# Sympa - SYsteme de Multi-Postage Automatique + +# NOTE: This file is auto-generated. Don't edit it manually. +# Instead, modifications should be made on support/make_crawlers.pl file. + +package Sympa::WWW::Crawlers; + +use strict; +use warnings; + +use constant crawler => qr{ + ( + %s + ) +}x; + +1; + +__END__ +=encoding utf-8 + +=head1 NAME + +Sympa::WWW::Crawlers - Regular expression for User-Agent of web crawlers + +=head1 DESCRIPTION + +This module keeps definition of regular expressions used by Sympa software. + +The regular expression is generated from the data provided by the +project below. + +=head1 SEE ALSO + +=over + +=item * + +Syntactic patterns of HTTP user-agents used by bots / robots / crawlers / +scrapers / spiders + +L + +=back + + +=head1 HISTORY + +Crawler detection feature of WWSympa was introduced on Sympa 5.4a.4 +which derives information provided by L. + +On Sympa 6.2.74, it was replaced with regular expression matching +using information provided by crawler-user-agents project above. 
+ +=cut From 27cdbc944a2395b2eb87e83bc9e88767422e6caf Mon Sep 17 00:00:00 2001 From: IKEDA Soji Date: Sat, 17 Jun 2023 07:13:23 +0900 Subject: [PATCH 2/3] Bot would be logged in wwslog() --- src/cgi/wwsympa.fcgi.in | 45 +++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 26 deletions(-) diff --git a/src/cgi/wwsympa.fcgi.in b/src/cgi/wwsympa.fcgi.in index c5d2e0279..7ffd261d1 100644 --- a/src/cgi/wwsympa.fcgi.in +++ b/src/cgi/wwsympa.fcgi.in @@ -1808,27 +1808,21 @@ Sympa::Spool::Listmaster->instance->flush(purge => 1); ## Write to log sub wwslog { my $facility = shift; + my $msg = shift; - my $msg = shift; my $remote = $ENV{'REMOTE_HOST'} || $ENV{'REMOTE_ADDR'}; - my $wwsmsg = ''; - - $wwsmsg = "[list $param->{'list'}] " . $wwsmsg - if $param->{'list'}; - - $wwsmsg = "[user $param->{'user'}{'email'}] " . $wwsmsg - if $param->{'user'}{'email'}; - - $wwsmsg = "[rss] " . $wwsmsg - if $rss; - - $wwsmsg = "[client $remote] " . $wwsmsg - if $remote; - - $wwsmsg = "[session $session->{'id_session'}] " . $wwsmsg - if $session; - - $wwsmsg = "[robot $robot] " . $wwsmsg; + my $bot = $session->{'is_a_crawler'} || undef if $session; + + my $wwsmsg = join ' ', + grep {defined} ( + "[robot $robot]", + ($session and "[session $session->{'id_session'}]"), + ($remote and "[client $remote]"), + ($bot and "[bot <$bot>]"), + ($rss and "[rss]"), + ($param->{'user'}{'email'} and "[user $param->{'user'}{'email'}]"), + ($param->{'list'} and "[list $param->{'list'}]") + ); push @_, $wwsmsg; if ($msg =~ /^([(][^)]*[)])\s*(.*)/s) { @@ -8554,13 +8548,12 @@ sub do_arc { $param->{'file'} = $archive->{arc_directory} . '/' . $in{'arc_file'}; } - $param->{'date'} = Sympa::Tools::File::get_mtime( - $archive->{arc_directory} . '/' . $in{'arc_file'}); - # send page as static if client is a bot. That's prevent crawling all - # archices every weeks by google, yahoo and others bots - if ($session->{'is_a_crawler'}) { - $param->{'header_date'} = $param->{'date'}; - } + # Send page as static if client is a bot. That prevent crawling all + # archives every week by Google, Yahoo and other bots. + $param->{'header_date'} = + Sympa::Tools::File::get_mtime( + $archive->{arc_directory} . '/' . $in{'arc_file'}) + if $session->{'is_a_crawler'}; $param->{'archive_name'} = $in{'month'}; #test pour différentier les action d'un robot et d'un simple abonné From d4b4c9b549a7793b5d3fb04636c3af1022b16d6d Mon Sep 17 00:00:00 2001 From: IKEDA Soji Date: Mon, 10 Jun 2024 12:04:43 +0900 Subject: [PATCH 3/3] Add crawlers info --- src/lib/Sympa/WWW/Crawlers.pm | 630 ++++++++++++++++++++++++++++++++++ 1 file changed, 630 insertions(+) create mode 100644 src/lib/Sympa/WWW/Crawlers.pm diff --git a/src/lib/Sympa/WWW/Crawlers.pm b/src/lib/Sympa/WWW/Crawlers.pm new file mode 100644 index 000000000..493fc9202 --- /dev/null +++ b/src/lib/Sympa/WWW/Crawlers.pm @@ -0,0 +1,630 @@ +# -*- indent-tabs-mode: nil; -*- +# vim:ft=perl:et:sw=4 + +# Sympa - SYsteme de Multi-Postage Automatique + +# NOTE: This file is auto-generated. Don't edit it manually. +# Instead, modifications should be made on support/make_crawlers.pl file. 
+ +package Sympa::WWW::Crawlers; + +use strict; +use warnings; + +use constant crawler => qr{ + ( + Googlebot\/ + | Googlebot-Mobile + | Googlebot-Image + | Googlebot-News + | Googlebot-Video + | AdsBot-Google([^-]|$) + | AdsBot-Google-Mobile + | Feedfetcher-Google + | Mediapartners-Google + | Mediapartners[ ]\(Googlebot\) + | APIs-Google + | Google-InspectionTool + | Storebot-Google + | GoogleOther + | bingbot + | Slurp + | [wW]get + | LinkedInBot + | Python-urllib + | python-requests + | aiohttp + | httpx + | libwww-perl + | httpunit + | Nutch + | Go-http-client + | phpcrawl + | msnbot + | jyxobot + | FAST-WebCrawler + | FAST[ ]Enterprise[ ]Crawler + | BIGLOTRON + | Teoma + | convera + | seekbot + | Gigabot + | Gigablast + | exabot + | ia_archiver + | GingerCrawler + | webmon[ ] + | HTTrack + | grub\.org + | UsineNouvelleCrawler + | antibot + | netresearchserver + | speedy + | fluffy + | findlink + | msrbot + | panscient + | yacybot + | AISearchBot + | ips-agent + | tagoobot + | MJ12bot + | woriobot + | yanga + | buzzbot + | mlbot + | yandex\.com\/bots + | purebot + | Linguee[ ]Bot + | CyberPatrol + | voilabot + | Baiduspider + | citeseerxbot + | spbot + | twengabot + | postrank + | Turnitin + | scribdbot + | page2rss + | sitebot + | linkdex + | Adidxbot + | ezooms + | dotbot + | Mail\.RU_Bot + | discobot + | heritrix + | findthatfile + | europarchive\.org + | NerdByNature\.Bot + | sistrix[ ]crawler + | Ahrefs(Bot|SiteAudit) + | fuelbot + | CrunchBot + | IndeedBot + | mappydata + | woobot + | ZoominfoBot + | PrivacyAwareBot + | Multiviewbot + | SWIMGBot + | Grobbot + | eright + | Apercite + | semanticbot + | Aboundex + | domaincrawler + | wbsearchbot + | summify + | CCBot + | edisterbot + | SeznamBot + | ec2linkfinder + | gslfbot + | aiHitBot + | intelium_bot + | facebookexternalhit + | Yeti + | RetrevoPageAnalyzer + | lb-spider + | Sogou + | lssbot + | careerbot + | wotbox + | wocbot + | ichiro + | DuckDuckBot + | lssrocketcrawler + | drupact + | webcompanycrawler + | acoonbot + | openindexspider + | gnam[ ]gnam[ ]spider + | web-archive-net\.com\.bot + | backlinkcrawler + | coccoc + | integromedb + | content[ ]crawler[ ]spider + | toplistbot + | it2media-domain-crawler + | ip-web-crawler\.com + | siteexplorer\.info + | elisabot + | proximic + | changedetection + | arabot + | WeSEE:Search + | niki-bot + | CrystalSemanticsBot + | rogerbot + | 360Spider + | psbot + | InterfaxScanBot + | CC[ ]Metadata[ ]Scaper + | g00g1e\.net + | GrapeshotCrawler + | urlappendbot + | brainobot + | fr-crawler + | binlar + | SimpleCrawler + | Twitterbot + | cXensebot + | smtbot + | bnf\.fr_bot + | A6-Indexer + | ADmantX + | Facebot + | OrangeBot\/ + | memorybot + | AdvBot + | MegaIndex + | SemanticScholarBot + | ltx71 + | nerdybot + | xovibot + | BUbiNG + | Qwantify + | archive\.org_bot + | Applebot + | TweetmemeBot + | crawler4j + | findxbot + | S[eE][mM]rushBot + | yoozBot + | lipperhey + | Y!J + | Domain[ ]Re-Animator[ ]Bot + | AddThis + | Screaming[ ]Frog[ ]SEO[ ]Spider + | MetaURI + | Scrapy + | Livelap[bB]ot + | OpenHoseBot + | CapsuleChecker + | collection\@infegy\.com + | IstellaBot + | DeuSu\/ + | betaBot + | Cliqzbot\/ + | MojeekBot\/ + | netEstate[ ]NE[ ]Crawler + | SafeSearch[ ]microdata[ ]crawler + | Gluten[ ]Free[ ]Crawler\/ + | Sonic + | Sysomos + | Trove + | deadlinkchecker + | Slack-ImgProxy + | Embedly + | RankActiveLinkBot + | iskanie + | SafeDNSBot + | SkypeUriPreview + | Veoozbot + | Slackbot + | redditbot + | datagnionbot + | Google-Adwords-Instant + | adbeat_bot + | WhatsApp + | 
contxbot + | pinterest\.com\/bot + | electricmonk + | GarlikCrawler + | BingPreview\/ + | vebidoobot + | FemtosearchBot + | Yahoo[ ]Link[ ]Preview + | MetaJobBot + | DomainStatsBot + | mindUpBot + | Daum\/ + | Jugendschutzprogramm-Crawler + | Xenu[ ]Link[ ]Sleuth + | Pcore-HTTP + | moatbot + | KosmioBot + | [pP]ingdom + | AppInsights + | PhantomJS + | Gowikibot + | PiplBot + | Discordbot + | TelegramBot + | Jetslide + | newsharecounts + | James[ ]BOT + | Bark[rR]owler + | TinEye + | SocialRankIOBot + | trendictionbot + | Ocarinabot + | epicbot + | Primalbot + | DuckDuckGo-Favicons-Bot + | GnowitNewsbot + | Leikibot + | LinkArchiver + | YaK\/ + | PaperLiBot + | Digg[ ]Deeper + | dcrawl + | Snacktory + | AndersPinkBot + | Fyrebot + | EveryoneSocialBot + | Mediatoolkitbot + | Luminator-robots + | ExtLinksBot + | SurveyBot + | NING\/ + | okhttp + | Nuzzel + | omgili + | PocketParser + | YisouSpider + | um-LN + | ToutiaoSpider + | MuckRack + | Jamie's[ ]Spider + | AHC\/ + | NetcraftSurveyAgent + | Laserlikebot + | ^Apache-HttpClient + | AppEngine-Google + | Jetty + | Upflow + | Thinklab + | Traackr\.com + | Twurly + | Mastodon + | http_get + | DnyzBot + | botify + | 007ac9[ ]Crawler + | BehloolBot + | BrandVerity + | check_http + | BDCbot + | ZumBot + | EZID + | ICC-Crawler + | ArchiveBot + | ^LCC[ ] + | filterdb\.iss\.net\/crawler + | BLP_bbot + | BomboraBot + | Buck\/ + | Companybook-Crawler + | Genieo + | magpie-crawler + | MeltwaterNews + | Moreover + | newspaper\/ + | ScoutJet + | (^|[ ])sentry\/ + | StorygizeBot + | UptimeRobot + | OutclicksBot + | seoscanners + | Hatena + | Google[ ]Web[ ]Preview + | MauiBot + | AlphaBot + | SBL-BOT + | IAS[ ]crawler + | adscanner + | Netvibes + | acapbot + | Baidu-YunGuanCe + | bitlybot + | blogmuraBot + | Bot\.AraTurka\.com + | bot-pge\.chlooe\.com + | BoxcarBot + | BTWebClient + | ContextAd[ ]Bot + | Digincore[ ]bot + | Disqus + | Feedly + | Fetch\/ + | Fever + | Flamingo_SearchEngine + | FlipboardProxy + | g2reader-bot + | G2[ ]Web[ ]Services + | imrbot + | K7MLWCBot + | Kemvibot + | Landau-Media-Spider + | linkapediabot + | vkShare + | Siteimprove\.com + | BLEXBot\/ + | DareBoost + | ZuperlistBot\/ + | Miniflux\/ + | Feedspot + | Diffbot\/ + | SEOkicks + | tracemyfile + | Nimbostratus-Bot + | zgrab + | PR-CY\.RU + | AdsTxtCrawler + | Datafeedwatch + | Zabbix + | TangibleeBot + | google-xrawler + | axios + | Amazon[ ]CloudFront + | Pulsepoint + | CloudFlare-AlwaysOnline + | Google-Structured-Data-Testing-Tool + | WordupInfoSearch + | WebDataStats + | HttpUrlConnection + | Seekport[ ]Crawler + | ZoomBot + | VelenPublicWebCrawler + | MoodleBot + | jpg-newsbot + | outbrain + | W3C_Validator + | Validator\.nu + | W3C-checklink + | W3C-mobileOK + | W3C_I18n-Checker + | FeedValidator + | W3C_CSS_Validator + | W3C_Unicorn + | Google-PhysicalWeb + | Blackboard + | ICBot\/ + | BazQux + | Twingly + | Rivva + | Experibot + | awesomecrawler + | Dataprovider\.com + | GroupHigh\/ + | theoldreader\.com + | AnyEvent + | Uptimebot\.org + | Nmap[ ]Scripting[ ]Engine + | 2ip\.ru + | Clickagy + | Caliperbot + | MBCrawler + | online-webceo-bot + | B2B[ ]Bot + | AddSearchBot + | Google[ ]Favicon + | HubSpot + | Chrome-Lighthouse + | HeadlessChrome + | CheckMarkNetwork\/ + | www\.uptime\.com + | Streamline3Bot\/ + | serpstatbot\/ + | MixnodeCache\/ + | ^curl + | SimpleScraper + | RSSingBot + | Jooblebot + | fedoraplanet + | Friendica + | NextCloud + | Tiny[ ]Tiny[ ]RSS + | RegionStuttgartBot + | Bytespider + | Datanyze + | Google-Site-Verification + | TrendsmapResolver + | 
tweetedtimes + | NTENTbot + | Gwene + | SimplePie + | SearchAtlas + | Superfeedr + | feedbot + | UT-Dorkbot + | Amazonbot + | SerendeputyBot + | Eyeotabot + | officestorebot + | Neticle[ ]Crawler + | SurdotlyBot + | LinkisBot + | AwarioSmartBot + | AwarioRssBot + | RyteBot + | FreeWebMonitoring[ ]SiteChecker + | AspiegelBot + | NAVER[ ]Blog[ ]Rssbot + | zenback[ ]bot + | SentiBot + | Domains[ ]Project\/ + | Pandalytics + | VKRobot + | bidswitchbot + | tigerbot + | NIXStatsbot + | Atom[ ]Feed[ ]Robot + | [Cc]urebot + | PagePeeker\/ + | Vigil\/ + | rssbot\/ + | startmebot\/ + | JobboerseBot + | seewithkids + | NINJA[ ]bot + | Cutbot + | BublupBot + | BrandONbot + | RidderBot + | Taboolabot + | Dubbotbot + | FindITAnswersbot + | infoobot + | Refindbot + | BlogTraffic\/\d\.\d+[ ]Feed-Fetcher + | SeobilityBot + | Cincraw + | Dragonbot + | VoluumDSP-content-bot + | FreshRSS + | BitBot + | ^PHP-Curl-Class + | Google-Certificates-Bridge + | centurybot + | Viber + | e\.ventures[ ]Investment[ ]Crawler + | evc-batch + | PetalBot + | virustotal + | (^|[ ])PTST\/ + | minicrawler + | Cookiebot + | trovitBot + | seostar\.co + | IonCrawl + | Uptime-Kuma + | SeekportBot + | FreshpingBot + | Feedbin + | CriteoBot + | Snap[ ]URL[ ]Preview[ ]Service + | Better[ ]Uptime[ ]Bot + | RuxitSynthetic + | Google-Read-Aloud + | Valve\/Steam + | OdklBot\/ + | GPTBot + | ChatGPT-User + | YandexRenderResourcesBot\/ + | LightspeedSystemsCrawler + | ev-crawler\/ + | BitSightBot\/ + | woorankreview\/ + | Google-Safety + | AwarioBot + | DataForSeoBot + | Linespider + | WellKnownBot + | A[ ]Patent[ ]Crawler + | StractBot + | search\.marginalia\.nu + | YouBot + | Nicecrawler + | Neevabot + | BrightEdge[ ]Crawler + | SiteCheckerBotCrawler + | TombaPublicWebCrawler + | CrawlyProjectCrawler + | KomodiaBot + | KStandBot + | CISPA[ ]Webcrawler + | MTRobot + | hyscore\.io + | AlexandriaOrgBot + | 2ip[ ]bot + | Yellowbrandprotectionbot + | SEOlizer + | vuhuvBot + | INETDEX-BOT + | Synapse + | t3versionsBot + | deepnoc + | Cocolyzebot + | hypestat + | ReverseEngineeringBot + | sempi\.tech + | Iframely + | MetaInspector + | node-fetch + | lkxscan + | python-opengraph + | OpenGraphCheck + | developers\.google\.com\/\+\/web\/snippet + | SenutoBot + | MaCoCu + | NewsBlur + | inoreader + | NetSystemsResearch + | PageThing + | WordPress\/ + | PhxBot + | ImagesiftBot + | Expanse + | InternetMeasurement + | ^BW\/ + | GeedoBot + | Audisto[ ]Crawler + | PerplexityBot\/ + | [cC]laude[bB]ot + | Monsidobot + | GroupMeBot + ) +}x; + +1; + +__END__ +=encoding utf-8 + +=head1 NAME + +Sympa::WWW::Crawlers - Regular expression for User-Agent of web crawlers + +=head1 DESCRIPTION + +This module keeps definition of regular expressions used by Sympa software. + +The regular expression is generated from the data provided by the +project below. + +=head1 SEE ALSO + +=over + +=item * + +Syntactic patterns of HTTP user-agents used by bots / robots / crawlers / +scrapers / spiders + +L + +=back + + +=head1 HISTORY + +Crawler detection feature of WWSympa was introduced on Sympa 5.4a.4 +which derives information provided by L. + +On Sympa 6.2.74, it was replaced with regular expression matching +using information provided by crawler-user-agents project above. + +=cut
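
A quick way to exercise the generated module outside of WWSympa is to match a User-Agent string against the exported pattern, the same way the reworked Sympa::WWW::Session::_is_a_crawler() in patch 1 does. The sketch below is illustrative only and is not part of the patches: the "src/lib" include path and the sample User-Agent are assumptions.

    #!/usr/bin/env perl
    # Minimal sketch: check a User-Agent string against the generated pattern,
    # mirroring what Sympa::WWW::Session::_is_a_crawler() does.
    # Assumption: the module was generated by support/make_crawlers.pl and is
    # reachable under src/lib; the sample User-Agent is made up for the test.
    use strict;
    use warnings;
    use lib 'src/lib';
    use Sympa::WWW::Crawlers;

    my $ua = shift @ARGV
        // 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)';

    if ($ua =~ Sympa::WWW::Crawlers::crawler()) {
        # The whole alternation is wrapped in one capture group, so $1 holds
        # the sub-pattern that matched.
        print "crawler (matched \"$1\")\n";
    } else {
        print "not a known crawler\n";
    }

Because the pattern is one compiled alternation with a single capture group, each request costs one regexp match, and substring patterns such as "Googlebot/" now match real-world User-Agent strings; the old crawlers_detection.conf table only worked when the full User-Agent string was listed verbatim, since it was looked up as an exact hash key.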