From f6a23d06021e1c5244fa7880357aba11761080b4 Mon Sep 17 00:00:00 2001
From: Irina Peshinskaya
Date: Tue, 23 Sep 2014 12:42:10 +0300
Subject: [PATCH] MT#9177 Fix device annotations sql error in devmod_edit.

---
 share/tools/mech1.pl | 355 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 355 insertions(+)
 create mode 100644 share/tools/mech1.pl

diff --git a/share/tools/mech1.pl b/share/tools/mech1.pl
new file mode 100644
index 0000000000..0f359d24ae
--- /dev/null
+++ b/share/tools/mech1.pl
@@ -0,0 +1,355 @@
+#! /usr/bin/perl
+
+# Experimental web spider. It drives Firefox through WWW::Mechanize::Firefox,
+# starts at "$host/dashboard" and records the URLs, URL templates and link
+# controls it finds in the MySQL database "spider".
+
+use strict;
+use warnings;
+
+use Data::Dumper;
+use WWW::Mechanize::Firefox;
+use DBI;
+use DateTime;
+use DateTime::Format::ISO8601;
+#use Storable qw/dclone/;
+use Clone 'clone';
+
+# Start time of this run; URLs whose lastvisit is older are due for a visit.
+my $datetime = DateTime->from_epoch(
+    time_zone => DateTime::TimeZone->new(name => 'local'),
+    epoch     => time(),
+);
+my $mech = WWW::Mechanize::Firefox->new();
+# Fail with the driver's message instead of an undefined-handle error later.
+my $dbh = DBI->connect('dbi:mysql:spider;host=localhost', 'root', '')
+    or die "Cannot connect to database 'spider': $DBI::errstr\n";
+my $cfg = {
+    url_duplicate_template => {
+        rule       => 'deny',
+        rule_allow => '',
+    }
+};
+
+#$dbh->do('drop table urltemplate');
+#$dbh->do('drop table url');
+#$dbh->do('drop table urlcontent');
+#$dbh->do('drop table urlvariant');
+#$dbh->do('drop table control');
+#$dbh->do('drop table url_control');
+#$dbh->do('drop table url_template');
+#$dbh->do('create table urltemplate(id integer(11) auto_increment primary key, template varchar(512))');
+#$dbh->do('create table url(id integer(11) unsigned not null auto_increment primary key, url varchar(512), urltemplate_id integer(11), lastvisit timestamp default 0)');
+#$dbh->do('create table urlvariant(id integer(11) unsigned not null, url varchar(512), container_url_id integer(11))');
+#$dbh->do('create table urlcontent(id integer(11) unsigned not null, content text)');
+#$dbh->do('create table control(id integer(11) unsigned not null auto_increment primary key, label text, goto_url_id integer(11) unsigned)');
+#$dbh->do('create table url_control(url_id integer(11) unsigned, control_id integer(11) unsigned)');
+#$dbh->do('create table url_template(url_id integer(11) unsigned, urltemplate_id integer(11) unsigned)');
+
+# Xvfb :99 &
+# DISPLAY=:99 firefox --display=:99 &
+## DISPLAY=:99 xdotool search --onlyvisible --title firefox
+## DISPLAY=:99 xdotool windowfocus 4194416
+# DISPLAY=:99 ~/fillform.pl
+
+
+our $host = 'https://192.168.56.7:1444';
+
+
+get_data($mech,$host.'/dashboard');
+#while (my $url = $dbh->selectrow_array('select url from url where unix_timestamp(lastvisit) < ? limit 1', undef, $datetime->epoch)){
+#    get_data($mech,$url);
+#}
+
+# Visit one URL and register what is found on the page.
+#
+# Positional arguments: the mechanize object, the URL string, optionally the
+# url record of the page that linked here, and a flag that is true when the
+# page is already loaded in the browser (no request is issued then).
+# Links inside div#content are registered as controls; anchor links are followed.
+sub get_data{
+    my ($mech, $url_str, $container_url, $loaded) = @_;
+    my $url_in = { url => $url_str };
+    print $url_in->{url}.";\n";
+    return if !check_url_toadd($url_in->{url});
+
+    my $url = get_url_db($url_in);
+    my $urltemplate = get_urltemplate_db({
+        template => get_urltemplate($url->{url}),
+    });
+
+    # Due when never visited or last visited before this run started. The
+    # inner parentheses matter: "&&" binds tighter than "||".
+    my $url_toadd = 0;
+    if ($url && (!$url->{epoch} || $url->{epoch} < $datetime->epoch)) {
+        $url_toadd = 1;
+        if ($container_url) {
+            $url_toadd = rule_url_template($container_url, $urltemplate);
+        }
+    }
+
+    if ($container_url) {    #after checking
+        register_url_template($container_url, $urltemplate);
+    }
+
+    return if !$url_toadd;
+
+    if (!$loaded) {
+        my $request_url = pre_get_url($url->{url});
+        print $request_url.";\n";
+        $mech->get($request_url);
+    }
+    else {
+        #here add to db urls from clicks, if necessary
+    }
+    #$mech->eval("alert('QQ');");
+    #follow_link starts here. Specially for follow_link we don't need recursion, but for click we do.
+    $url->{content} = \$mech->content();
+    set_url_visited($url);
+    #foreach my $clickable_in ( $mech->clickables() ) {
+    #    my $control = get_control_db({
+    ##            goto_url_id => $goto_url->{id},
+    #            label => process_control_label($clickable_in->{innerHTML})
+    #        }
+    #    );
+    #    register_control($url,$control);
+    #    #print Dumper $control;
+    #    #$mech->click($control);
+    #}
+    #foreach my $link ( $mech->links() ) {
+    #foreach my $link ( $mech->find_all_links_dom() ) {
+    #my $repl = $mech->repl();
+    my $contentDiv = $mech->xpath('//div[@id="content"]', single => 1);
+    my @links = $mech->find_all_links_dom( node => $contentDiv );
+    #my @links = $mech->find_all_links_dom( );
+    print $#links;
+    # FIXME(review): debugging stop - nothing below runs while this die() is
+    # here. Kept on purpose: removing it turns this dry run into a live crawl.
+    die();
+    foreach my $link ( @links ) {
+        next if !check_url_toadd($link->{href});
+
+        my $control_url = { url => process_url($link->{href}) };
+        my $goto_url = get_url_db( $control_url );
+        next if !$goto_url;
+
+        my $control = get_control_db({
+            goto_url_id => $goto_url->{id},
+            label => process_control_label($link->{innerHTML} || $link->{href}),
+            #label => process_control_label($link->text)
+        });
+        register_control($url,$control);
+
+        #print Dumper $link;
+        print "================================\n";
+        print Dumper { map { $_ => $url->{$_}; } qw/id url urltemplate_id/};
+        print Dumper $control_url;
+        print Dumper $control;
+        print Dumper $link->{tagName};
+        next unless 'A' eq $link->{tagName} && check_url_toadd($goto_url->{url});
+
+        #get_data($goto_url->{url},$url,1);
+        #get_data($goto_url->{url},$url,0);
+        #my $mechRecursion = clone($mech);
+        # Scope the handler to this iteration instead of replacing it globally.
+        local $SIG{ALRM} = \&request_timed_out;
+        eval {
+            alarm(10);
+            #$mechRecursion->follow_link($link);
+            print "follow_link\n";
+            $mech->follow_link($link);
+            alarm(0);    # Cancel the pending alarm if user responds.
+        };
+        my $follow_error = $@;    # save before anything else can reset $@
+        alarm(0);    # also cancel when follow_link died for another reason
+        if ($follow_error eq "link_timed_out\n") {
+            print "link hanged\n";
+            next;
+        }
+
+        #get_data($mechRecursion,$goto_url->{url},$url,1);
+        print "get_data\n";
+        get_data($mech,$goto_url->{url},$url,0);
+        eval {
+            alarm(10);
+            $mech->back();
+            alarm(0);    # Cancel the pending alarm if user responds.
+        };
+        alarm(0);    # make sure no alarm outlives the guarded call
+        #if ($@ =~ /LOCAL_LINK/) {
+        #    print "Timed out. Proceeding with default\n";
+        #}
+    }
+}
+
+# SIGALRM handler used around browser calls. The trailing newline stops Perl
+# from appending " at FILE line N." so $@ can be compared to the exact text.
+sub request_timed_out{
+    die("link_timed_out\n");
+}
+
+# Look up the url row matching $url->{url}; insert it when missing.
+# Returns the database row (with an extra "epoch" column holding
+# unix_timestamp(lastvisit)) or, for a new row, the given hash with "id" set.
+sub get_url_db{
+    my($url) = @_;
+    my $url_db = $dbh->selectrow_hashref('select *,unix_timestamp(lastvisit) as epoch from url where url=?',undef,$url->{url});
+    if(!$url_db){
+        $dbh->do('insert into url(url) values(?)',undef,$url->{url});
+        $url_db = $url;
+        $url_db->{id} = $dbh->last_insert_id(undef,'spider','url','id');
+    }
+    return $url_db;
+}
+
+# Mark a url row as visited in this run, store the page content and delete the
+# url's url_control rows (get_data registers the page's controls again after).
+sub set_url_visited{
+    my($url_db) = @_;
+    print Dumper [ "update url set lastvisit=from_unixtime(?) where id=?\n", $datetime->epoch, $url_db->{id} ];
+    $dbh->do('update url set lastvisit=from_unixtime(?) where id=?',undef,$datetime->epoch,$url_db->{id});
+    #die();
+    # NOTE(review): the commented-out schema at the top of this file calls this
+    # table "urlcontent", and no code here inserts a row into it - confirm.
+    $dbh->do('update url_content set content=? where id=?',undef,${$url_db->{content}},$url_db->{id});
+    $dbh->do('delete from url_control where url_id=?',undef,$url_db->{id});
+}
+
+# Look up the control with the given label and goto_url_id; insert it when
+# missing. Returns the database row or, for a new row, the given hash with
+# "id" set.
+sub get_control_db{
+    my($control) = @_;
+    my $control_db = $dbh->selectrow_hashref('select * from control where label=? and goto_url_id=?',undef,@$control{qw/label goto_url_id/});
+    if(!$control_db){
+        $dbh->do('insert into control(label,goto_url_id) values(?,?)',undef,@$control{qw/label goto_url_id/});
+        $control_db = $control;
+        $control_db->{id} = $dbh->last_insert_id(undef,'spider','control','id');
+        #if(!(my $res=$dbh->selectrow_array('select id from url_variant where url=?'))){
+        #
+        #}
+    }
+    return $control_db;
+}
+
+# Link a control to the url it appears on, unless the link already exists.
+sub register_control{
+    my($url,$control) = @_;
+    if(!$dbh->selectrow_hashref('select * from url_control where url_id=? and control_id=?',undef,$url->{id},$control->{id})){
+        $dbh->do('insert into url_control(url_id,control_id)values(?,?)',undef,$url->{id},$control->{id});
+    }
+}
+
+# Write label and goto_url_id of an existing control row back to the database.
+sub set_control_db{
+    my($control) = @_;
+    $dbh->do('update control set label=?,goto_url_id=? where id=?',undef,@$control{qw/label goto_url_id id/});
+}
+
+# Build the template of a URL: every run of digits becomes the literal text
+# "\d+", so URLs that differ only in numeric ids share one template.
+sub get_urltemplate{
+    my($url) = @_;
+    return $url if !defined $url;
+    # "=~" binds the substitution to $url ("= ~" would assign a bitwise
+    # negation); "\\" is needed to emit a literal backslash.
+    $url =~ s/\d+/\\d+/g;
+    return $url;
+}
+
+# Look up the urltemplate row for $template->{template}; insert it when
+# missing. Returns the database row or, for a new row, the given hash with
+# "id" set.
+sub get_urltemplate_db{
+    my($template) = @_;
+    my $template_db = $dbh->selectrow_hashref('select * from urltemplate where template=?',undef,$template->{template});
+    if(!$template_db){
+        $dbh->do('insert into urltemplate(template) values(?)',undef,$template->{template});
+        $template_db = $template;
+        $template_db->{id} = $dbh->last_insert_id(undef,'spider','urltemplate','id');
+    }
+    return $template_db;
+}
+
+# Decide whether a URL is worth recording. Returns 0 for an undefined, empty
+# or whitespace-only URL, one ending in "javascript:" (optionally followed by
+# ";") or in "#", or one containing a lang=xx parameter; returns 1 otherwise.
+sub check_url_toadd{
+    my ($url) = @_;
+    return 0 if !defined $url;
+    $url=~s/\s+//g;
+    return 0 if !$url;
+    return 0 if $url =~/javascript:;?$/i;
+    return 0 if $url =~/#$/i;
+    return 0 if $url =~/lang=[a-z]{2}/i;
+    return 1;
+}
+
+# Normalise a link target for storage: strip $host, drop a back=... query
+# parameter and a trailing "?", and turn a lone "/" into the empty string.
+sub process_url{
+    my ($url) = @_;
+    $url=~s/\Q$host\E//;
+    $url=~s/[&]?back=[^&]+//;
+    $url=~s/\?$//;
+    $url=~s/^\/$//;
+    return $url;
+}
+
+# Build the URL to request: make it absolute under $host, collapse repeated
+# slashes, then restore the "//" that follows the scheme.
+sub pre_get_url{
+    my ($url) = @_;
+    $url=~s/\Q$host\E//;
+    if($url !~/^https?:/){
+        $url=$host.'/'.$url;
+    }
+    $url =~ s!/+!/!g;
+    $url =~ s!(https?:(?:\d+)?)/+!$1//!g;
+    return $url;
+}
+
+# Strip whitespace at the start and end of every line of a control label (the
+# pattern uses /m). An undefined label yields the empty string.
+sub process_control_label{
+    my ($str) = @_;
+    $str = '' if !defined $str;
+    $str=~s/^\s*|\s*$//gsm;
+    return $str;
+}
+
+##--------------------------------------
+# Apply the url_duplicate_template rule. Returns 0 when the rule is 'deny' and
+# this template is already registered for the container url, 1 otherwise.
+sub rule_url_template{
+    my($container_url,$urltemplate) = @_;
+    my $result = 1;
+    #here something - if config deny add duplicate template - don't add and parse duplicate url for the same template on the samepage
+    #ifconfig specify exceptions for deny - consider it
+    #if config specify allow duplicate - add duplicate, and consider defined deny config
+    if($cfg->{url_duplicate_template}->{rule} eq 'deny'){
+        if(get_registered_url_template($container_url,$urltemplate)){
+            $result = 0;
+        }
+    }
+    return $result;
+}
+
+# Record that a template occurs on the container url. Like register_control,
+# it inserts only when the pair is not stored yet, so revisits do not pile up
+# duplicate rows.
+sub register_url_template{
+    my($container_url,$urltemplate) = @_;
+    if(!get_registered_url_template($container_url,$urltemplate)){
+        $dbh->do('insert into url_template(url_id,urltemplate_id)values(?,?)',undef,$container_url->{id},$urltemplate->{id});
+    }
+}
+
+# Return the urltemplate_id stored for this container url and template, or
+# nothing when the pair is not registered.
+sub get_registered_url_template{
+    my($container_url,$urltemplate) = @_;
+    return $dbh->selectrow_array('select urltemplate_id from url_template where url_id=? and urltemplate_id=?',undef,$container_url->{id},$urltemplate->{id});
+}
+1;
+__END__