Commit

Now sessions from crawlers are stored in the session table. Crawler detection is controlled by /etc/crawlers_detection.conf.

Currently this file only lists user_agent strings. In the future it could detect crawlers by IP address, and later we could detect crawlers by automatically analysing the session table.




git-svn-id: https://subversion.renater.fr/sympa/trunk@4601 05aa8bb8-cd2b-0410-b1d7-8918dfa770ce
dverdin committed Sep 14, 2007
1 parent aa6dff3 commit 2212fdb
Showing 5 changed files with 352 additions and 64 deletions.
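For orientation, a minimal sketch of how the new helper in src/tools.pl (see below) might be called when a web session is created. The call site is not part of this commit, and the use of $ENV{'HTTP_USER_AGENT'} is an assumption about the wwsympa environment:

    # Hypothetical call site (not in this commit): flag the session as
    # coming from a crawler before it is stored in the session table.
    my $context = {'user_agent_string' => $ENV{'HTTP_USER_AGENT'}};
    if (&tools::is_a_crawler($robot, $context)) {
        # store the session flagged as a crawler session
    }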
42 changes: 40 additions & 2 deletions src/Conf.pm
@@ -270,6 +270,7 @@ my %trusted_applications = ('trusted_application' => {'occurrence' => '0-n',
}
);


my $wwsconf;
%Conf = ();

@@ -479,6 +480,9 @@ sub load {
$Conf{'sympa'} = "$Conf{'email'}\@$Conf{'host'}";
$Conf{'request'} = "$Conf{'email'}-request\@$Conf{'host'}";
$Conf{'trusted_applications'} = &load_trusted_application ();
$Conf{'crawlers_detection'} = &load_crawlers_detection ();

# open (TMP, ">> /tmp/dump1"); printf TMP "dump de la conf dans conf.pm\n" ; &tools::dump_var($Conf{'crawlers_detection'}, 0,\*TMP); close TMP;

$Conf{'pictures_url'} = $Conf{'static_content_url'}.'/pictures/';
$Conf{'pictures_path'} = $Conf{'static_content_path'}.'/pictures/';
@@ -680,6 +684,7 @@ sub load_robots {
}
# printf STDERR "load trusted de $robot";
$robot_conf->{$robot}{'trusted_applications'} = &load_trusted_application($robot);
$robot_conf->{$robot}{'crawlers_detection'} = &load_crawlers_detection($robot);
close (ROBOT_CONF);
}
closedir(DIR);
@@ -1194,7 +1199,40 @@ sub load_trusted_application {
# open TMP, ">/tmp/dump1";&tools::dump_var(&load_generic_conf_file($config,\%trusted_applications);, 0,\*TMP);close TMP;
return (&load_generic_conf_file($config,\%trusted_applications));

}


## load crawlers_detection.conf configuration file
sub load_crawlers_detection {
my $robot = shift;

my %crawlers_detection_conf = ('user_agent_string' => {'occurrence' => '0-n',
'format' => '.+'
} );

my $config ;
if (defined $robot) {
$config = $Conf{'etc'}.'/'.$robot.'/crawlers_detection.conf';
}else{
$config = $Conf{'etc'}.'/crawlers_detection.conf' ;
$config = '--ETCBINDIR--/crawlers_detection.conf' unless (-f $config);
}

print STDERR "crawlers_detection $config ($robot)\n";
return undef unless (-r $config);
my $hashtab = &load_generic_conf_file($config,\%crawlers_detection_conf);
my $hashhash ;


foreach my $kword (keys %{$hashtab}) {
next unless ($crawlers_detection_conf{$kword}); # ignore comments and default
foreach my $value (@{$hashtab->{$kword}}) {
$hashhash->{$kword}{$value} = 'true';
}
}
# open (TMP, ">> /tmp/dump1");printf TMP "retour de load_crawlers_detection : \n"; &tools::dump_var($hashhash, 0,\*TMP); close TMP; # xxxxxxxx

return $hashhash;
}

############################################################
@@ -1368,11 +1406,11 @@ sub load_generic_conf_file {
}else {
$admin{$pname} = \%hash;
}
}else {
}else{
## This should be a single line
my $xxxmachin = $structure{$pname}{'format'};
unless ($#paragraph == 0) {
printf STDERR 'Expecting a single line for %s parameter in %sxxxxxx %s\n', $pname, $config_file, $xxxmachin ;
printf STDERR 'Expecting a single line for %s parameter in %s %s\n', $pname, $config_file, $xxxmachin ;
return undef if $on_error eq 'abort';
}

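For reference, a sketch of the structure load_crawlers_detection returns, derived from the loop above; the two example entries are taken from the crawlers_detection.conf file added below:

    # load_generic_conf_file yields a list of values per keyword; the
    # loop above converts it into a hash of hashes for direct lookup:
    my $crawlers = {
        'user_agent_string' => {
            'Googlebot' => 'true',
            'msnbot'    => 'true',
        },
    };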
260 changes: 260 additions & 0 deletions src/etc/crawlers_detection.conf
@@ -0,0 +1,260 @@
# Robot user agent strings. Source: http://www.useragentstring.com/pages/useragentstring.php (09/2007)

user_agent_string ABACHOBot

user_agent_string AbiLogicBot

user_agent_string Accoona-AI-Agent

user_agent_string AnyApexBot

user_agent_string Arachmo

user_agent_string B-l-i-t-z-B-O-T

user_agent_string Baiduspider

user_agent_string BecomeBot

user_agent_string Bimbot

user_agent_string BlitzBOT

user_agent_string boitho.com-dc

user_agent_string boitho.com-robot

user_agent_string btbot

user_agent_string Cerberian Drtrs

user_agent_string ConveraCrawler

user_agent_string cosmos

user_agent_string DataparkSearch

user_agent_string DiamondBot

user_agent_string EmeraldShield.com \WebBot

user_agent_string envolk[ITS]spider

user_agent_string EsperanzaBot

user_agent_string Exabot

user_agent_string FAST Enterprise Crawler

user_agent_string FAST-WebCrawler

user_agent_string FDSE robot

user_agent_string FindLinks

user_agent_string FurlBot

user_agent_string FyberSpider

user_agent_string g2crawler

user_agent_string Gaisbot

user_agent_string genieBot

user_agent_string Gigabot

user_agent_string Girafabot

user_agent_string Googlebot

user_agent_string Googlebot-Image

user_agent_string hl_ftien_spider

user_agent_string htdig

user_agent_string ia_archiver

user_agent_string ichiro

user_agent_string IRLbot

user_agent_string IssueCrawler

user_agent_string Java

user_agent_string Jyxobot

user_agent_string LapozzBot

user_agent_string Larbin

user_agent_string libwww-perl

user_agent_string LinkWalker

user_agent_string lmspider

user_agent_string lwp-trivial

user_agent_string mabontland

user_agent_string Mediapartners-Google

user_agent_string MJ12bot

user_agent_string Mnogosearch

user_agent_string mogimogi

user_agent_string MojeekBot

user_agent_string Morning Paper

user_agent_string msnbot

user_agent_string MSRBot

user_agent_string MVAClient

user_agent_string NetResearchServer

user_agent_string NG-Search

user_agent_string nicebot

user_agent_string noxtrumbot

user_agent_string Nusearch Spider

user_agent_string NutchCVS

user_agent_string obot

user_agent_string oegp

user_agent_string OmniExplorer_Bot

user_agent_string Orbiter

user_agent_string PageBitesHyperBot

user_agent_string polybot

user_agent_string Pompos

user_agent_string Psbot

user_agent_string PycURL

user_agent_string Python-urllib

user_agent_string RAMPyBot

user_agent_string RufusBot

user_agent_string SandCrawler

user_agent_string SBIder

user_agent_string Scrubby

user_agent_string SearchSight

user_agent_string Seekbot

user_agent_string semanticdiscovery

user_agent_string Sensis Web Crawler

user_agent_string SEOChat::Bot

user_agent_string Shim-Crawler

user_agent_string ShopWiki

user_agent_string Shoula robot

user_agent_string silk

user_agent_string Snappy

user_agent_string sogou spider

user_agent_string Speedy Spider

user_agent_string Sqworm

user_agent_string StackRambler

user_agent_string SurveyBot

user_agent_string SynooBot

user_agent_string Teoma

user_agent_string TerrawizBot

user_agent_string TheSuBot

user_agent_string Thumbnail.CZ robot

user_agent_string TurnitinBot

user_agent_string updated

user_agent_string VoilaBot

user_agent_string Vortex

user_agent_string voyager

user_agent_string VYU2

user_agent_string webcollage

user_agent_string Websquash.com

user_agent_string wf84

user_agent_string WoFindeIch Robot

user_agent_string Xaldon_WebSpider

user_agent_string yacy

user_agent_string Yahoo! Slurp

user_agent_string Yahoo! Slurp China

user_agent_string YahooSeeker

user_agent_string YahooSeeker-Testing

user_agent_string yoogliFetchAgent

user_agent_string Zao

user_agent_string Zealbot

user_agent_string zspider

user_agent_string ZyBorg

# OFFLINE BROWSERS

user_agent_string Offline Explorer

user_agent_string SuperBot

user_agent_string Web Downloader

user_agent_string WebCopier

user_agent_string WebZIP

# EmailSiphon

user_agent_string EmailSiphon

21 changes: 21 additions & 0 deletions src/tools.pl
@@ -2034,6 +2034,27 @@ sub remove_pid {
return 1;
}

# Input: a user agent string and IP. Returns 1 if the client is suspected to be a crawler.
# Initial version based on the crawlers_detection.conf file only.
# Later: use the session table to identify clients that create a lot of sessions.
sub is_a_crawler {

my $robot = shift;
my $context = shift;

# if ($Conf{$robot}{'crawlers_detection'}) {
# return ($Conf{$robot}{'crawlers_detection'}{'user_agent_string'}{$context->{'user_agent_string'}});
# }

foreach my $xx (keys %{$context}){
&do_log ('info',"xxxxxxxxxxx is_a_crawler $xx = '$context->{$xx}'");
&do_log ('info',"yyyyyyyyyyy = $Conf{'crawlers_detection'}{'user_agent_string'}{$xx}");
}

# open (TMP, ">> /tmp/dump1"); printf TMP "dump de la conf dans is_a_crawler : \n"; &tools::dump_var($Conf{'crawlers_detection'}, 0,\*TMP); close TMP;
return $Conf{'crawlers_detection'}{'user_agent_string'}{$context->{'user_agent_string'}};
}

sub write_pid {
my ($pidfile, $pid) = @_;

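Note that the lookup in is_a_crawler is an exact match on the whole user_agent_string value, so a full browser-style string that merely contains a listed name (e.g. 'Mozilla/5.0 (compatible; Googlebot/2.1)') will not match the 'Googlebot' entry. A minimal sketch of the check as committed, assuming %Conf has been loaded by Conf::load (the $robot argument is currently unused, since the per-robot branch is commented out):

    # Returns 'true' for a listed string, undef otherwise.
    my $flag = &tools::is_a_crawler($robot,
        {'user_agent_string' => 'Googlebot'});
    print $flag ? "crawler\n" : "not a crawler\n";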