parent
cdba3fd0ac
commit
f6a23d0602
@ -0,0 +1,286 @@
|
|||||||
|
#! /usr/bin/perl
|
||||||
|
|
||||||
|
use strict;
|
||||||
|
|
||||||
|
use Data::Dumper;
|
||||||
|
use WWW::Mechanize::Firefox;
|
||||||
|
use DBI;
|
||||||
|
use DateTime;
|
||||||
|
use DateTime::Format::ISO8601;
|
||||||
|
#use Storable qw/dclone/;
|
||||||
|
use Clone 'clone';
|
||||||
|
|
||||||
|
my $datetime = DateTime->from_epoch(
|
||||||
|
time_zone => DateTime::TimeZone->new(name => 'local'),
|
||||||
|
epoch => time(),
|
||||||
|
);
|
||||||
|
my $mech = WWW::Mechanize::Firefox->new();
|
||||||
|
my $dbh = DBI->connect('dbi:mysql:spider;host=localhost', 'root', '');
|
||||||
|
my $cfg = {
|
||||||
|
url_duplicate_template =>{
|
||||||
|
rule => 'deny',
|
||||||
|
rule_allow => '',
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
#$dbh->do('drop table urltemplate');
|
||||||
|
#$dbh->do('drop table url');
|
||||||
|
#$dbh->do('drop table urlcontent');
|
||||||
|
#$dbh->do('drop table urlvariant');
|
||||||
|
#$dbh->do('drop table control');
|
||||||
|
#$dbh->do('drop table url_control');
|
||||||
|
#$dbh->do('drop table url_template');
|
||||||
|
#$dbh->do('create table urltemplate(id integer(11) auto_increment primary key, template varchar(512))');
|
||||||
|
#$dbh->do('create table url(id integer(11) unsigned not null auto_increment primary key, url varchar(512), urltemplate_id integer(11), lastvisit timestamp default 0)');
|
||||||
|
#$dbh->do('create table urlvariant(id integer(11) unsigned not null, url varchar(512), container_url_id integer(11))');
|
||||||
|
#$dbh->do('create table urlcontent(id integer(11) unsigned not null, content text)');
|
||||||
|
#$dbh->do('create table control(id integer(11) unsigned not null auto_increment primary key, label text, goto_url_id integer(11) unsigned)');
|
||||||
|
#$dbh->do('create table url_control(url_id integer(11) unsigned, control_id integer(11) unsigned)');
|
||||||
|
#$dbh->do('create table url_template(url_id integer(11) unsigned, urltemplate_id integer(11) unsigned)');
|
||||||
|
|
||||||
|
# Xvfb :99 &
|
||||||
|
# DISPLAY=:99 firefox --display=:99 &
|
||||||
|
## DISPLAY=:99 xdotool search --onlyvisible --title firefox
|
||||||
|
## DISPLAY=:99 xdotool windowfocus 4194416
|
||||||
|
# DISPLAY=:99 ~/fillform.pl
|
||||||
|
|
||||||
|
|
||||||
|
our $host = 'https://192.168.56.7:1444';
|
||||||
|
|
||||||
|
|
||||||
|
get_data($mech,$host.'/dashboard');
|
||||||
|
#while (my $url = $dbh->selectrow_array('select url from url where unix_timestamp(lastvisit) < ? limit 1', undef, $datetime->epoch)){
|
||||||
|
# get_data($mech,$url);
|
||||||
|
#}
|
||||||
|
|
||||||
|
sub get_data{
|
||||||
|
my ($url_in,$mech,$container_url,$loaded) = {};
|
||||||
|
($mech,@{$url_in}{qw/url/},$container_url,$loaded) = @_;
|
||||||
|
print $url_in->{url}.";\n";
|
||||||
|
if(!check_url_toadd($url_in->{url})){
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
my($url) = get_url_db($url_in);
|
||||||
|
my($urltemplate) = get_urltemplate_db({
|
||||||
|
template => get_urltemplate($url->{url}),
|
||||||
|
});
|
||||||
|
|
||||||
|
my $url_toadd = 0;
|
||||||
|
if( $url && (!$url->{epoch}) || ( $url->{epoch} < $datetime->epoch) ){
|
||||||
|
$url_toadd = 1;
|
||||||
|
if($container_url){
|
||||||
|
$url_toadd = rule_url_template($container_url,$urltemplate);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if($container_url){#after checking
|
||||||
|
register_url_template($container_url,$urltemplate);
|
||||||
|
}
|
||||||
|
|
||||||
|
if($url_toadd){
|
||||||
|
if(!$loaded){
|
||||||
|
print pre_get_url($url->{url}).";\n";
|
||||||
|
$mech->get(pre_get_url($url->{url}));
|
||||||
|
}else{
|
||||||
|
#here add to db urls from clicks, if necessary
|
||||||
|
}
|
||||||
|
#$mech->eval("alert('QQ');");
|
||||||
|
#follow_link starts here. Specially for follow_link we don't need recursion, but for click we do.
|
||||||
|
$url->{content} = \$mech->content();
|
||||||
|
set_url_visited($url);
|
||||||
|
#foreach my $clickable_in ( $mech->clickables() ) {
|
||||||
|
# my $control = get_control_db({
|
||||||
|
## goto_url_id => $goto_url->{id},
|
||||||
|
# label => process_control_label($clickable_in->{innerHTML})
|
||||||
|
# }
|
||||||
|
# );
|
||||||
|
# register_control($url,$control);
|
||||||
|
# #print Dumper $control;
|
||||||
|
# #$mech->click($control);
|
||||||
|
#}
|
||||||
|
#foreach my $link ( $mech->links() ) {
|
||||||
|
#foreach my $link ( $mech->find_all_links_dom() ) {
|
||||||
|
#my $repl = $mech->repl();
|
||||||
|
my $contentDiv = $mech->xpath('//div[@id="content"]', single => 1);
|
||||||
|
my @links = $mech->find_all_links_dom( node => $contentDiv );
|
||||||
|
#my @links = $mech->find_all_links_dom( );
|
||||||
|
print $#links;
|
||||||
|
die();
|
||||||
|
foreach my $link ( @links ) {
|
||||||
|
if(!check_url_toadd($link->{href})){
|
||||||
|
next;
|
||||||
|
}
|
||||||
|
|
||||||
|
my $control_url = { url => process_url($link->{href}) };
|
||||||
|
my $goto_url = get_url_db( $control_url );
|
||||||
|
if(! ($goto_url)){
|
||||||
|
next;
|
||||||
|
}
|
||||||
|
my $control=get_control_db({
|
||||||
|
goto_url_id => $goto_url->{id},
|
||||||
|
label => process_control_label($link->{innerHTML} || $link->{href}) ,
|
||||||
|
#label => process_control_label($link->text)
|
||||||
|
});
|
||||||
|
register_control($url,$control);
|
||||||
|
|
||||||
|
#print Dumper $link;
|
||||||
|
print "================================\n";
|
||||||
|
print Dumper { map { $_ => $url->{$_}; } qw/id url urltemplate_id/};
|
||||||
|
print Dumper $control_url;
|
||||||
|
print Dumper $control;
|
||||||
|
print Dumper $link->{tagName};
|
||||||
|
if(( 'A' eq $link->{tagName}) && check_url_toadd($goto_url->{url})){
|
||||||
|
#get_data($goto_url->{url},$url,1);
|
||||||
|
#get_data($goto_url->{url},$url,0);
|
||||||
|
#my $mechRecursion = clone($mech);
|
||||||
|
$SIG{ALRM} = \&request_timed_out;
|
||||||
|
eval {
|
||||||
|
alarm (10);
|
||||||
|
#$mechRecursion->follow_link($link);
|
||||||
|
print "follow_link\n";
|
||||||
|
$mech->follow_link($link);
|
||||||
|
alarm(0); # Cancel the pending alarm if user responds.
|
||||||
|
};
|
||||||
|
if('link_timed_out' eq $@){
|
||||||
|
print "link hanged\n";
|
||||||
|
}else{
|
||||||
|
#get_data($mechRecursion,$goto_url->{url},$url,1);
|
||||||
|
print "get_data\n";
|
||||||
|
get_data($mech,$goto_url->{url},$url,0);
|
||||||
|
eval {
|
||||||
|
alarm (10);
|
||||||
|
$mech->back();
|
||||||
|
alarm(0); # Cancel the pending alarm if user responds.
|
||||||
|
};
|
||||||
|
}
|
||||||
|
#if ($@ =~ /LOCAL_LINK/) {
|
||||||
|
# print "Timed out. Proceeding with default\n";
|
||||||
|
#}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
sub request_timed_out{
|
||||||
|
die("link_timed_out");
|
||||||
|
}
|
||||||
|
sub get_url_db{
|
||||||
|
my($url) = @_;
|
||||||
|
my $url_db;
|
||||||
|
if(!($url_db = $dbh->selectrow_hashref('select *,unix_timestamp(lastvisit) as epoch from url where url=?',undef,$url->{url}))){
|
||||||
|
$dbh->do('insert into url(url) values(?)',undef,$url->{url});
|
||||||
|
$url_db = $url;
|
||||||
|
$url_db->{id} = $dbh->last_insert_id(undef,'spider','url','id');
|
||||||
|
}
|
||||||
|
return $url_db;
|
||||||
|
}
|
||||||
|
sub set_url_visited{
|
||||||
|
my($url_db) = @_;
|
||||||
|
print Dumper [ "update url set lastvisit=from_unixtime(?) where id=?\n", $datetime->epoch, $url_db->{id} ];
|
||||||
|
$dbh->do('update url set lastvisit=from_unixtime(?) where id=?',undef,$datetime->epoch,$url_db->{id});
|
||||||
|
#die();
|
||||||
|
$dbh->do('update url_content set content=? where id=?',undef,${$url_db->{content}},$url_db->{id});
|
||||||
|
$dbh->do('delete from url_control where url_id=?',undef,$url_db->{id});
|
||||||
|
}
|
||||||
|
sub get_control_db{
|
||||||
|
my($control) = @_;
|
||||||
|
my $control_db;
|
||||||
|
if(!($control_db = $dbh->selectrow_hashref('select * from control where label=? and goto_url_id=?',undef,@$control{qw/label goto_url_id/}))){
|
||||||
|
$dbh->do('insert into control(label,goto_url_id) values(?,?)',undef,@$control{qw/label goto_url_id/});
|
||||||
|
$control_db = $control;
|
||||||
|
$control_db->{id} = $dbh->last_insert_id(undef,'spider','control','id');
|
||||||
|
#if(!(my $res=$dbh->selectrow_array('select id from url_variant where url=?'))){
|
||||||
|
#
|
||||||
|
#}
|
||||||
|
}
|
||||||
|
return $control_db;
|
||||||
|
}
|
||||||
|
sub register_control{
|
||||||
|
my($url,$control) = @_;
|
||||||
|
if(!(my $res = $dbh->selectrow_hashref('select * from url_control where url_id=? and control_id=?',undef,$url->{id},$control->{id}))){
|
||||||
|
$dbh->do('insert into url_control(url_id,control_id)values(?,?)',undef,$url->{id},$control->{id});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
sub set_control_db{
|
||||||
|
my($control) = @_;
|
||||||
|
$dbh->do('update control set label=?,goto_url_id=? where id=?',undef,@$control{qw/label goto_url_id id/});
|
||||||
|
}
|
||||||
|
sub get_urltemplate{
|
||||||
|
my($url) = @_;
|
||||||
|
$url = ~s/\d+/\d+/;
|
||||||
|
return $url;
|
||||||
|
}
|
||||||
|
sub get_urltemplate_db{
|
||||||
|
my($template) = @_;
|
||||||
|
my $template_db;
|
||||||
|
if(!($template_db = $dbh->selectrow_hashref('select * from urltemplate where template=?',undef,$template->{template}))){
|
||||||
|
$dbh->do('insert into urltemplate(template) values(?)',undef,$template->{template});
|
||||||
|
$template_db = $template;
|
||||||
|
$template_db->{id} = $dbh->last_insert_id(undef,'spider','urltemplate','id');
|
||||||
|
}
|
||||||
|
return $template_db;
|
||||||
|
}
|
||||||
|
sub check_url_toadd{
|
||||||
|
my ($url) = @_;
|
||||||
|
my $res = 1;
|
||||||
|
$url=~s/\s+//g;
|
||||||
|
if(!$url){
|
||||||
|
$res = 0;
|
||||||
|
}
|
||||||
|
if($url =~/javascript:;?$/i){
|
||||||
|
$res = 0;
|
||||||
|
}
|
||||||
|
if($url =~/#$/i){
|
||||||
|
$res = 0;
|
||||||
|
}
|
||||||
|
if($url =~/lang=[a-z]{2}/i){
|
||||||
|
$res = 0;
|
||||||
|
}
|
||||||
|
return $res;
|
||||||
|
}
|
||||||
|
sub process_url{
|
||||||
|
my ($url) = @_;
|
||||||
|
$url=~s/\Q$host\E//;
|
||||||
|
$url=~s/[&]?back=[^&]+//;
|
||||||
|
$url=~s/\?$//;
|
||||||
|
$url=~s/^\/$//;
|
||||||
|
return $url;
|
||||||
|
}
|
||||||
|
sub pre_get_url{
|
||||||
|
my ($url) = @_;
|
||||||
|
$url=~s/\Q$host\E//;
|
||||||
|
if($url !~/^https?:/){
|
||||||
|
$url=$host.'/'.$url;
|
||||||
|
}
|
||||||
|
$url =~ s!/+!/!g;
|
||||||
|
$url =~ s!(https?:(?:\d+)?)/+!$1//!g;
|
||||||
|
return $url;
|
||||||
|
}
|
||||||
|
sub process_control_label{
|
||||||
|
my ($str) = @_;
|
||||||
|
$str=~s/^\s*|\s*$//gsm;
|
||||||
|
return $str;
|
||||||
|
}
|
||||||
|
##--------------------------------------
|
||||||
|
sub rule_url_template{
|
||||||
|
my($container_url,$urltemplate) = @_;
|
||||||
|
my $result = 1;
|
||||||
|
#here something - if config deny add duplicate template - don't add and parse duplicate url for the same template on the samepage
|
||||||
|
#ifconfig specify exceptions for deny - consider it
|
||||||
|
#if config specify allow duplicate - add duplicate, and consider defined deny config
|
||||||
|
if($cfg->{url_duplicate_template}->{rule} eq 'deny'){
|
||||||
|
if(get_registered_url_template($container_url,$urltemplate)){
|
||||||
|
$result = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return $result;
|
||||||
|
}
|
||||||
|
sub register_url_template{
|
||||||
|
my($container_url,$urltemplate) = @_;
|
||||||
|
$dbh->do('insert into url_template(url_id,urltemplate_id)values(?,?)',undef,$container_url->{id},$urltemplate->{id});
|
||||||
|
}
|
||||||
|
sub get_registered_url_template{
|
||||||
|
my($container_url,$urltemplate) = @_;
|
||||||
|
return $dbh->selectrow_array('select urltemplate_id from url_template where url_id=? and urltemplate_id=?',undef,$container_url->{id},$urltemplate->{id});
|
||||||
|
}
|
||||||
|
1;
|
||||||
|
__END__
|
Loading…
Reference in new issue