You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
bulk-processor/lib/NGCP/BulkProcessor/SqlConnectors/CSVDB.pm

879 lines
23 KiB

package NGCP::BulkProcessor::SqlConnectors::CSVDB;
use strict;
## no critic
use NGCP::BulkProcessor::Globals qw(
$LongReadLen_limit
$csv_path);
use NGCP::BulkProcessor::Logging qw(
getlogger
dbdebug
dbinfo
xls2csvinfo
texttablecreated
indexcreated
tabletruncated
tabledropped);
use NGCP::BulkProcessor::LogError qw(
dberror
dbwarn
fieldnamesdiffer
fileerror
filewarn
xls2csverror
xls2csvwarn);
use NGCP::BulkProcessor::Array qw(contains setcontains);
use NGCP::BulkProcessor::Utils qw(makepath changemod chopstring);
use NGCP::BulkProcessor::SqlConnector;
use DBI;
use DBD::CSV 0.26;
use File::Path qw(remove_tree);
use Locale::Recode;
use Spreadsheet::ParseExcel;
use Spreadsheet::ParseExcel::FmtUnicode;
use Excel::Reader::XLSX;
use Text::CSV_XS;
use File::Basename;
use MIME::Parser;
use HTML::PullParser qw();
use HTML::Entities qw(decode_entities);
use IO::Uncompress::Unzip qw(unzip $UnzipError);
use File::Copy qw();
# no debian package yet:
#use DateTime::Format::Excel;
require Exporter;
our @ISA = qw(Exporter NGCP::BulkProcessor::SqlConnector);
our @EXPORT_OK = qw(
cleanupcsvdirs
xlsbin2csv
xlsxbin2csv
sanitize_column_name
sanitize_spreadsheet_name
get_tableidentifier
$csvextension
$default_csv_config);
#excel_to_timestamp
#excel_to_date
our $csvextension = '.csv';
our $default_csv_config = { eol => "\r\n",
sep_char => ';',
quote_char => '"',
escape_char => '"',
};
my @TABLE_TAGS = qw(table tr td);
my $LongReadLen = $LongReadLen_limit; #bytes
my $LongTruncOk = 0;
#my $logger = getlogger(__PACKAGE__);
#my $lock_do_chunk = 0;
#my $lock_get_chunk = 0;
my $rowblock_transactional = 0;
my $invalid_excel_spreadsheet_chars_pattern = '[' . quotemeta('[]:*?/\\') . ']';
sub sanitize_spreadsheet_name { #Invalid character []:*?/\ in worksheet name
my $spreadsheet_name = shift;
$spreadsheet_name =~ s/$invalid_excel_spreadsheet_chars_pattern//g;
return chopstring($spreadsheet_name,31); #Sheetname eventually inconsistent etc. must be <= 31 chars
}
sub sanitize_column_name {
my $column_name = shift;
$column_name =~ s/\W/_/g;
return $column_name;
}
#sub excel_to_date {
# my $excel_date_value = shift;
# if ($excel_date_value > 0) {
# my $datetime = DateTime::Format::Excel->parse_datetime($excel_date_value);
# return $datetime->ymd('-'); # prints 1992-02-28
# }
# return undef;
#}
#sub excel_to_timestamp {
# my $excel_datetime_value = shift;
# if ($excel_datetime_value > 0) {
# my $datetime = DateTime::Format::Excel->parse_datetime($excel_datetime_value);
# return $datetime->ymd('-') . ' ' . $datetime->hms(':');
# }
# return undef;
#}
sub new {
my $class = shift;
my $self = NGCP::BulkProcessor::SqlConnector->new(@_);
$self->{db_dir} = undef;
$self->{f_dir} = undef;
$self->{csv_tables} = undef;
$self->{files} = undef;
$self->{drh} = DBI->install_driver('CSV');
bless($self,$class);
dbdebug($self,__PACKAGE__ . ' connector created',getlogger(__PACKAGE__));
return $self;
}
sub _connectidentifier {
my $self = shift;
return $self->{f_dir};
}
sub tableidentifier {
my $self = shift;
my $tablename = shift;
return $tablename;
}
sub _columnidentifier {
my $self = shift;
my $columnname = shift;
return sanitize_column_name($columnname); #actually happens automatically by dbd::csv
}
sub get_tableidentifier {
my ($tablename,$db_dir) = @_;
if (defined $db_dir) {
return $db_dir . '.' . $tablename;
} else {
return $tablename;
}
}
sub getsafetablename {
my $self = shift;
my $tableidentifier = shift;
return lc($self->SUPER::getsafetablename($tableidentifier));
}
sub getdatabases {
my $self = shift;
local *DBDIR;
if (not opendir(DBDIR, $csv_path)) {
fileerror('cannot opendir ' . $csv_path . ': ' . $!,getlogger(__PACKAGE__));
return [];
}
my @dirs = grep { $_ ne '.' && $_ ne '..' && -d $csv_path . $_ } readdir(DBDIR);
closedir DBDIR;
my @databases = ();
foreach my $dir (@dirs) {
push @databases,$dir;
}
return \@databases;
}
sub _createdatabase {
my $self = shift;
my ($db_dir) = @_;
my $f_dir; # = _get_f_dir($db_dir);
if (length($db_dir) > 0) {
$f_dir = $csv_path . $db_dir . '/';
} else {
$f_dir = $csv_path;
}
dbinfo($self,'opening csv folder',getlogger(__PACKAGE__));
#mkdir $f_dir;
makepath($f_dir,\&fileerror,getlogger(__PACKAGE__));
#if (not -d $f_dir) {
# fileerror('cannot opendir ' . $f_dir . ': ' . $!,getlogger(__PACKAGE__));
# return;
#}
#local *DBDIR;
#if (not opendir(DBDIR, $f_dir)) {
# fileerror('cannot opendir ' . $f_dir . ': ' . $!,getlogger(__PACKAGE__));
# return;
#}
#closedir DBDIR;
#changemod($f_dir);
return $f_dir;
}
sub db_connect {
my $self = shift;
my ($db_dir,$csv_tables) = @_;
$self->SUPER::db_connect($db_dir,$csv_tables);
$self->{db_dir} = $db_dir;
$self->{csv_tables} = $csv_tables;
$self->{f_dir} = $self->_createdatabase($db_dir);
my $dbh_config = {
f_schema => undef,
#f_lock => 0, n/a in 0.26 yet?
cvs_eol => $default_csv_config->{eol},
cvs_sep_char => $default_csv_config->{sep_char},
cvs_quote_char => $default_csv_config->{quote_char},
cvs_escape_char => $default_csv_config->{escape_char},
PrintError => 0,
RaiseError => 0,
};
my $usetabledef = 0;
if (defined $csv_tables and ref $csv_tables eq 'HASH') {
$usetabledef = 1;
} else {
$dbh_config->{f_dir} = $self->{f_dir};
$dbh_config->{f_ext} = $csvextension . '/r';
}
my $dbh = DBI->connect ('dbi:CSV:','','',$dbh_config) or
dberror($self,'error connecting: ' . $self->{drh}->errstr(),getlogger(__PACKAGE__));
$dbh->{InactiveDestroy} = 1;
$dbh->{LongReadLen} = $LongReadLen;
$dbh->{LongTruncOk} = $LongTruncOk;
$self->{dbh} = $dbh;
if ($usetabledef) {
my @files = ();
foreach my $tablename (keys %$csv_tables) {
$dbh->{csv_tables}->{$tablename} = $csv_tables->{$tablename};
push @files,$csv_tables->{$tablename}->{file};
dbinfo($self,'using ' . $csv_tables->{$tablename}->{file},getlogger(__PACKAGE__));
}
$self->{files} = \@files;
} else {
my @tablenames = $self->_list_tables();
foreach my $tablename (@tablenames) {
$dbh->{csv_tables}->{$tablename} = { eol => $default_csv_config->{eol},
sep_char => $default_csv_config->{sep_char},
quote_char => $default_csv_config->{quote_char},
escape_char => $default_csv_config->{escape_char},
}
}
}
dbinfo($self,'connected',getlogger(__PACKAGE__));
}
sub _list_tables {
my $self = shift;
my @table_list;
eval {
@table_list = map { local $_ = $_; s/^\.\///g; $_; } $self->{dbh}->func('list_tables');
};
if ($@) {
my @tables;
eval {
@tables = $self->{dbh}->func("get_avail_tables") or return;
};
if ($@) {
dberror($self,'error listing csv tables: ' . $@,getlogger(__PACKAGE__));
} else {
foreach my $ref (@tables) {
if (defined $ref) {
if (ref $ref eq 'ARRAY') {
push @table_list, $ref->[2];
#} else {
# push @table_list, $ref;
}
}
}
}
}
return @table_list; #removeduplicates(\@table_list);
}
sub _db_disconnect {
my $self = shift;
$self->SUPER::_db_disconnect();
}
sub vacuum {
my $self = shift;
my $tablename = shift;
}
sub cleanupcsvdirs {
my (@remainingdbdirs) = @_;
local *DBDIR;
if (not opendir(DBDIR, $csv_path)) {
fileerror('cannot opendir ' . $csv_path . ': ' . $!,getlogger(__PACKAGE__));
return;
}
my @dirs = grep { $_ ne '.' && $_ ne '..' && -d $csv_path . $_ } readdir(DBDIR);
closedir DBDIR;
my @remainingdbdirectories = ();
foreach my $dirname (@remainingdbdirs) {
push @remainingdbdirectories,$csv_path . $dirname . '/';
}
foreach my $dir (@dirs) {
#print $file;
my $dirpath = $csv_path . $dir . '/';
if (not contains($dirpath,\@remainingdbdirectories)) {
#if (remove_tree($dirpath) == 0) {
# filewarn('cannot remove ' . $dirpath . ': ' . $!,getlogger(__PACKAGE__));
#}
remove_tree($dirpath, {
'keep_root' => 0,
'verbose' => 1,
'error' => \my $err });
if (@$err) {
for my $diag (@$err) {
my ($file, $message) = %$diag;
if ($file eq '') {
filewarn("cleanup: $message",getlogger(__PACKAGE__));
} else {
filewarn("problem unlinking $file: $message",getlogger(__PACKAGE__));
}
}
}
}
}
}
sub getfieldnames {
my $self = shift;
my $tablename = shift;
my $fieldnames = [];
if (defined $self->{dbh}) {
my $query = 'SELECT * FROM ' . $self->tableidentifier($tablename) . ' LIMIT 1';
dbdebug($self,'getfieldnames: ' . $query,getlogger(__PACKAGE__));
my $sth = $self->{dbh}->prepare($query) or $self->_prepare_error($query);
$sth->execute() or $self->_execute_error($query,$sth,());
$fieldnames = $sth->{NAME};
$sth->finish();
}
return $fieldnames;
}
sub getprimarykeycols {
my $self = shift;
my $tablename = shift;
return [];
}
sub create_primarykey {
my $self = shift;
my ($tablename,$keycols,$fieldnames) = @_;
return 0;
}
sub create_indexes {
my $self = shift;
my ($tablename,$indexes,$keycols) = @_;
return 0;
}
sub _gettablefilename {
my $self = shift;
my $tablename = shift;
return $self->{f_dir} . $tablename . $csvextension;
}
sub copytablefile {
my $self = shift;
my $tablename = shift;
my $target = shift;
my $drop_header_line = shift;
my $tablefilename = $self->_gettablefilename($tablename);
$self->db_disconnect();
if (File::Copy::copy($tablefilename,$target)) {
`sed -i '1d' $target` if $drop_header_line;
dbinfo($self,"$tablefilename copied to $target",getlogger(__PACKAGE__));
} else {
dberror($self,"copy from $tablefilename to $target failed: $!",getlogger(__PACKAGE__));
}
}
sub create_texttable {
my $self = shift;
my ($tablename,$fieldnames,$keycols,$indexes,$truncate) = @_;
if (length($tablename) > 0 and defined $fieldnames and ref $fieldnames eq 'ARRAY') {
my $created = 0;
if ($self->table_exists($tablename) == 0) {
if (not exists $self->{dbh}->{csv_tables}->{$tablename}) {
$self->{dbh}->{csv_tables}->{$tablename} = { eol => $default_csv_config->{eol},
sep_char => $default_csv_config->{sep_char},
quote_char => $default_csv_config->{quote_char},
escape_char => $default_csv_config->{escape_char},
};
}
my $statement = 'CREATE TABLE ' . $self->tableidentifier($tablename) . ' (';
$statement .= join(' TEXT, ',map { local $_ = $_; $_ = $self->columnidentifier($_); $_; } @$fieldnames) . ' TEXT';
$statement .= ')';
$self->db_do($statement);
changemod($self->_gettablefilename($tablename));
texttablecreated($self,$tablename,getlogger(__PACKAGE__));
$created = 1;
} else {
my $fieldnamesfound = $self->getfieldnames($tablename);
if (not setcontains($fieldnames,$fieldnamesfound,1)) {
fieldnamesdiffer($self,$tablename,$fieldnames,$fieldnamesfound,getlogger(__PACKAGE__));
return 0;
}
}
if (not $created and $truncate) {
$self->truncate_table($tablename);
}
return 1;
} else {
return 0;
}
}
sub multithreading_supported {
my $self = shift;
return 0;
}
sub rowblock_transactional {
my $self = shift;
return $rowblock_transactional;
}
sub truncate_table {
my $self = shift;
my $tablename = shift;
$self->db_do('DELETE FROM ' . $self->tableidentifier($tablename));
tabletruncated($self,$tablename,getlogger(__PACKAGE__));
}
sub table_exists {
my $self = shift;
my $tablename = shift;
if (defined $self->{dbh}) {
my @tables = $self->_list_tables();
return contains($tablename,\@tables);
}
return undef;
}
sub drop_table {
my $self = shift;
my $tablename = shift;
if ($self->table_exists($tablename) > 0) {
$self->db_do('DROP TABLE ' . $self->tableidentifier($tablename));
delete $self->{dbh}->{csv_tables}->{$tablename};
tabledropped($self,$tablename,getlogger(__PACKAGE__));
return 1;
}
return 0;
}
sub db_begin {
my $self = shift;
if (defined $self->{dbh}) {
dbdebug($self, "transactions not supported",getlogger(__PACKAGE__));
}
}
sub db_commit {
my $self = shift;
if (defined $self->{dbh}) {
dbdebug($self, "transactions not supported",getlogger(__PACKAGE__));
}
}
sub db_rollback {
my $self = shift;
if (defined $self->{dbh}) {
dbdebug($self, "transactions not supported",getlogger(__PACKAGE__));
}
}
sub db_do_begin {
my $self = shift;
my $query = shift;
#my $tablename = shift;
$self->SUPER::db_do_begin($query,$rowblock_transactional,@_);
}
sub db_get_begin {
my $self = shift;
my $query = shift;
#my $tablename = shift;
$self->SUPER::db_get_begin($query,$rowblock_transactional,@_);
}
sub db_finish {
my $self = shift;
my $rollback = shift;
$self->SUPER::db_finish($rowblock_transactional,$rollback);
}
sub xlsbin2csv {
my ($inputfile,$outputfile,$worksheetname,$sourcecharset) = @_;
return _convert_xlsbin2csv($inputfile,
$worksheetname,
$sourcecharset,
$outputfile,
'UTF-8',
$default_csv_config->{quote_char},
$default_csv_config->{escape_char},
$default_csv_config->{sep_char},
$default_csv_config->{eol});
}
sub _convert_xlsbin2csv {
my ($SourceFilename,$worksheet,$SourceCharset,$DestFilename,$DestCharset,$quote_char,$escape_char,$sep_char,$eol) = @_;
my $csvlinecount = 0;
xls2csvinfo('start converting ' . $SourceFilename . ' (worksheet ' . $worksheet . ') to ' . $DestFilename . ' ...',getlogger(__PACKAGE__));
$SourceCharset = 'UTF-8' unless $SourceCharset;
$DestCharset = $SourceCharset unless $DestCharset;
xls2csvinfo('reading ' . $SourceFilename . ' as ' . $SourceCharset,getlogger(__PACKAGE__));
my $XLS = IO::File->new();
if (not $XLS->open('<' . $SourceFilename)) {
fileerror('cannot open file ' . $SourceFilename . ': ' . $!,getlogger(__PACKAGE__));
return 0;
}
my $Formatter = Spreadsheet::ParseExcel::FmtUnicode->new(Unicode_Map => $SourceCharset);
my $parser = Spreadsheet::ParseExcel->new();
my $Book = $parser->parse($XLS,$Formatter); #$SourceFilename
if ( !defined $Book ) {
xls2csverror($parser->error(),getlogger(__PACKAGE__));
#die $parser->error(), ".\n";
$XLS->close();
return 0;
}
#my $Book = Spreadsheet::ParseExcel::Workbook->Parse($XLS, $Formatter) or xls2csverror('can\'t read spreadsheet',getlogger(__PACKAGE__));
my $Sheet;
if ($worksheet) {
#my $test = $Book->GetContent();
$Sheet = $Book->Worksheet($worksheet);
if (!defined $Sheet) {
xls2csverror('invalid spreadsheet',getlogger(__PACKAGE__));
return 0;
}
#unless ($O{'q'})
#{
# print qq|Converting the "$Sheet->{Name}" worksheet.\n|;
#}
xls2csvinfo('converting the ' . $Sheet->{Name} . ' worksheet',getlogger(__PACKAGE__));
} else {
($Sheet) = @{$Book->{Worksheet}};
if ($Book->{SheetCount}>1) {
#print qq|Multiple worksheets found. Will convert the "$Sheet->{Name}" worksheet.\n|;
xls2csvinfo('multiple worksheets found, converting ' . $Sheet->{Name},getlogger(__PACKAGE__));
}
}
unlink $DestFilename;
local *CSV;
if (not open(CSV,'>' . $DestFilename)) {
fileerror('cannot open file ' . $DestFilename . ': ' . $!,getlogger(__PACKAGE__));
$XLS->close();
return 0;
}
binmode CSV;
my $Csv = Text::CSV_XS->new({
'quote_char' => $quote_char,
'escape_char' => $escape_char,
'sep_char' => $sep_char,
'binary' => 1,
});
my $Recoder;
if ($DestCharset) {
$Recoder = Locale::Recode->new(from => $SourceCharset, to => $DestCharset);
}
for (my $Row = $Sheet->{MinRow}; defined $Sheet->{MaxRow} && $Row <= $Sheet->{MaxRow}; $Row++) {
my @Row;
for (my $Col = $Sheet->{MinCol}; defined $Sheet->{MaxCol} && $Col <= $Sheet->{MaxCol}; $Col++) {
my $Cell = $Sheet->{Cells}[$Row][$Col];
my $Value = "";
if ($Cell) {
$Value = $Cell->Value;
if ($Value eq 'GENERAL') {
# Sometimes numbers are read incorrectly as "GENERAL".
# In this case, the correct value should be in ->{Val}.
$Value = $Cell->{Val};
}
if ($DestCharset) {
$Recoder->recode($Value);
}
}
# We assume the line is blank if there is nothing in the first column.
last if $Col == $Sheet->{MinCol} and !$Value;
push(@Row,$Value);
}
next unless @Row;
my $Status = $Csv->combine(@Row);
if (!defined $Status) {
xls2csvwarn('csv error: ' . $Csv->error_input(),getlogger(__PACKAGE__));
}
if (defined $Status) {
print CSV $Csv->string();
if ($Row < $Sheet->{MaxRow}) {
print CSV $eol;
}
$csvlinecount++;
}
}
close CSV;
$XLS->close;
xls2csvinfo($csvlinecount . ' line(s) converted',getlogger(__PACKAGE__));
return $csvlinecount;
}
sub xlsxbin2csv {
my ($inputfile,$outputfile,$worksheetname) = @_;
return _convert_xlsxbin2csv($inputfile,
$worksheetname,
$outputfile,
'UTF-8',
$default_csv_config->{quote_char},
$default_csv_config->{escape_char},
$default_csv_config->{sep_char},
$default_csv_config->{eol});
}
sub _convert_xlsxbin2csv {
my ($SourceFilename,$worksheet,$DestFilename,$DestCharset,$quote_char,$escape_char,$sep_char,$eol) = @_;
my $csvlinecount = 0;
xls2csvinfo('start converting ' . $SourceFilename . ' (worksheet ' . $worksheet . ') to ' . $DestFilename . ' ...',getlogger(__PACKAGE__));
my $XLS = IO::File->new();
if (not $XLS->open('<' . $SourceFilename)) {
fileerror('cannot open file ' . $SourceFilename . ': ' . $!,getlogger(__PACKAGE__));
return 0;
} else {
$XLS->close();
}
#my $Formatter = Spreadsheet::ParseExcel::FmtUnicode->new(Unicode_Map => $SourceCharset);
my $reader = Excel::Reader::XLSX->new();
my $workbook = $reader->read_file($SourceFilename); #->parse($XLS,$Formatter); #$SourceFilename
my $SourceCharset = $workbook->{_reader}->encoding();
$DestCharset = $SourceCharset unless $DestCharset;
xls2csvinfo('reading ' . $SourceFilename . ' as ' . $SourceCharset,getlogger(__PACKAGE__));
if ( !defined $workbook ) {
xls2csverror($reader->error(),getlogger(__PACKAGE__));
#die $parser->error(), ".\n";
#$XLS->close();
return 0;
}
#my $Book = Spreadsheet::ParseExcel::Workbook->Parse($XLS, $Formatter) or xls2csverror('can\'t read spreadsheet',getlogger(__PACKAGE__));
my $sheet;
if ($worksheet) {
#my $test = $Book->GetContent();
$sheet = $workbook->worksheet($worksheet);
if (!defined $sheet) {
xls2csverror('invalid spreadsheet',getlogger(__PACKAGE__));
return 0;
}
#unless ($O{'q'})
#{
# print qq|Converting the "$Sheet->{Name}" worksheet.\n|;
#}
xls2csvinfo('converting the ' . $sheet->name() . ' worksheet',getlogger(__PACKAGE__));
} else {
$sheet = $workbook->worksheet(0);
if (@{$workbook->worksheets()} > 1) {
#print qq|Multiple worksheets found. Will convert the "$Sheet->{Name}" worksheet.\n|;
xls2csvinfo('multiple worksheets found, converting ' . $sheet->name(),getlogger(__PACKAGE__));
}
}
unlink $DestFilename;
local *CSV;
if (not open(CSV,'>' . $DestFilename)) {
fileerror('cannot open file ' . $DestFilename . ': ' . $!,getlogger(__PACKAGE__));
#$XLS->close();
return 0;
}
binmode CSV;
my $csv = Text::CSV_XS->new({
'quote_char' => $quote_char,
'escape_char' => $escape_char,
'sep_char' => $sep_char,
'binary' => 1,
});
my $Recoder;
if ($DestCharset) {
$Recoder = Locale::Recode->new(from => $SourceCharset, to => $DestCharset);
}
while ( my $row = $sheet->next_row() ) {
foreach my $value ($row->values()) {
$Recoder->recode($value);
}
my $status = $csv->combine($row->values());
if (!defined $status) {
xls2csvwarn('csv error: ' . $csv->error_input(),getlogger(__PACKAGE__));
}
if (defined $status) {
if ($row->row_number() > 0) {
print CSV $eol;
}
print CSV $csv->string();
$csvlinecount++;
}
}
close CSV;
#$XLS->close;
xls2csvinfo($csvlinecount . ' line(s) converted',getlogger(__PACKAGE__));
return $csvlinecount;
}
1;