You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
879 lines
23 KiB
879 lines
23 KiB
package NGCP::BulkProcessor::SqlConnectors::CSVDB;
|
|
use strict;
|
|
|
|
## no critic
|
|
|
|
use NGCP::BulkProcessor::Globals qw(
|
|
$LongReadLen_limit
|
|
$csv_path);
|
|
|
|
use NGCP::BulkProcessor::Logging qw(
|
|
getlogger
|
|
dbdebug
|
|
dbinfo
|
|
xls2csvinfo
|
|
texttablecreated
|
|
indexcreated
|
|
tabletruncated
|
|
tabledropped);
|
|
|
|
use NGCP::BulkProcessor::LogError qw(
|
|
dberror
|
|
dbwarn
|
|
fieldnamesdiffer
|
|
fileerror
|
|
filewarn
|
|
xls2csverror
|
|
xls2csvwarn);
|
|
|
|
use NGCP::BulkProcessor::Array qw(contains setcontains);
|
|
|
|
use NGCP::BulkProcessor::Utils qw(makepath changemod chopstring);
|
|
|
|
use NGCP::BulkProcessor::SqlConnector;
|
|
|
|
use DBI;
|
|
use DBD::CSV 0.26;
|
|
use File::Path qw(remove_tree);
|
|
use Locale::Recode;
|
|
use Spreadsheet::ParseExcel;
|
|
use Spreadsheet::ParseExcel::FmtUnicode;
|
|
use Excel::Reader::XLSX;
|
|
use Text::CSV_XS;
|
|
use File::Basename;
|
|
use MIME::Parser;
|
|
use HTML::PullParser qw();
|
|
use HTML::Entities qw(decode_entities);
|
|
use IO::Uncompress::Unzip qw(unzip $UnzipError);
|
|
|
|
use File::Copy qw();
|
|
|
|
# no debian package yet:
|
|
#use DateTime::Format::Excel;
|
|
|
|
require Exporter;
|
|
our @ISA = qw(Exporter NGCP::BulkProcessor::SqlConnector);
|
|
our @EXPORT_OK = qw(
|
|
cleanupcsvdirs
|
|
xlsbin2csv
|
|
xlsxbin2csv
|
|
sanitize_column_name
|
|
sanitize_spreadsheet_name
|
|
get_tableidentifier
|
|
$csvextension
|
|
$default_csv_config);
|
|
#excel_to_timestamp
|
|
#excel_to_date
|
|
|
|
our $csvextension = '.csv';
|
|
|
|
our $default_csv_config = { eol => "\r\n",
|
|
sep_char => ';',
|
|
quote_char => '"',
|
|
escape_char => '"',
|
|
};
|
|
|
|
my @TABLE_TAGS = qw(table tr td);
|
|
|
|
my $LongReadLen = $LongReadLen_limit; #bytes
|
|
my $LongTruncOk = 0;
|
|
|
|
#my $logger = getlogger(__PACKAGE__);
|
|
|
|
#my $lock_do_chunk = 0;
|
|
#my $lock_get_chunk = 0;
|
|
my $rowblock_transactional = 0;
|
|
|
|
my $invalid_excel_spreadsheet_chars_pattern = '[' . quotemeta('[]:*?/\\') . ']';
|
|
|
|
sub sanitize_spreadsheet_name { #Invalid character []:*?/\ in worksheet name
|
|
my $spreadsheet_name = shift;
|
|
$spreadsheet_name =~ s/$invalid_excel_spreadsheet_chars_pattern//g;
|
|
return chopstring($spreadsheet_name,31); #Sheetname eventually inconsistent etc. must be <= 31 chars
|
|
}
|
|
|
|
sub sanitize_column_name {
|
|
my $column_name = shift;
|
|
$column_name =~ s/\W/_/g;
|
|
return $column_name;
|
|
}
|
|
|
|
#sub excel_to_date {
|
|
# my $excel_date_value = shift;
|
|
# if ($excel_date_value > 0) {
|
|
# my $datetime = DateTime::Format::Excel->parse_datetime($excel_date_value);
|
|
# return $datetime->ymd('-'); # prints 1992-02-28
|
|
# }
|
|
# return undef;
|
|
#}
|
|
|
|
#sub excel_to_timestamp {
|
|
# my $excel_datetime_value = shift;
|
|
# if ($excel_datetime_value > 0) {
|
|
# my $datetime = DateTime::Format::Excel->parse_datetime($excel_datetime_value);
|
|
# return $datetime->ymd('-') . ' ' . $datetime->hms(':');
|
|
# }
|
|
# return undef;
|
|
#}
|
|
|
|
sub new {
|
|
|
|
my $class = shift;
|
|
|
|
my $self = NGCP::BulkProcessor::SqlConnector->new(@_);
|
|
|
|
$self->{db_dir} = undef;
|
|
$self->{f_dir} = undef;
|
|
$self->{csv_tables} = undef;
|
|
$self->{files} = undef;
|
|
|
|
$self->{drh} = DBI->install_driver('CSV');
|
|
|
|
bless($self,$class);
|
|
|
|
dbdebug($self,__PACKAGE__ . ' connector created',getlogger(__PACKAGE__));
|
|
|
|
return $self;
|
|
|
|
}
|
|
|
|
sub _connectidentifier {
|
|
|
|
my $self = shift;
|
|
return $self->{f_dir};
|
|
|
|
}
|
|
|
|
sub tableidentifier {
|
|
|
|
my $self = shift;
|
|
my $tablename = shift;
|
|
return $tablename;
|
|
|
|
}
|
|
|
|
sub _columnidentifier {
|
|
|
|
my $self = shift;
|
|
my $columnname = shift;
|
|
|
|
return sanitize_column_name($columnname); #actually happens automatically by dbd::csv
|
|
|
|
}
|
|
|
|
sub get_tableidentifier {
|
|
|
|
my ($tablename,$db_dir) = @_;
|
|
if (defined $db_dir) {
|
|
return $db_dir . '.' . $tablename;
|
|
} else {
|
|
return $tablename;
|
|
}
|
|
|
|
}
|
|
|
|
sub getsafetablename {
|
|
|
|
my $self = shift;
|
|
my $tableidentifier = shift;
|
|
return lc($self->SUPER::getsafetablename($tableidentifier));
|
|
|
|
}
|
|
|
|
sub getdatabases {
|
|
|
|
my $self = shift;
|
|
|
|
local *DBDIR;
|
|
if (not opendir(DBDIR, $csv_path)) {
|
|
fileerror('cannot opendir ' . $csv_path . ': ' . $!,getlogger(__PACKAGE__));
|
|
return [];
|
|
}
|
|
my @dirs = grep { $_ ne '.' && $_ ne '..' && -d $csv_path . $_ } readdir(DBDIR);
|
|
closedir DBDIR;
|
|
my @databases = ();
|
|
foreach my $dir (@dirs) {
|
|
push @databases,$dir;
|
|
}
|
|
return \@databases;
|
|
|
|
}
|
|
|
|
sub _createdatabase {
|
|
|
|
my $self = shift;
|
|
my ($db_dir) = @_;
|
|
|
|
my $f_dir; # = _get_f_dir($db_dir);
|
|
if (length($db_dir) > 0) {
|
|
$f_dir = $csv_path . $db_dir . '/';
|
|
} else {
|
|
$f_dir = $csv_path;
|
|
}
|
|
|
|
dbinfo($self,'opening csv folder',getlogger(__PACKAGE__));
|
|
|
|
#mkdir $f_dir;
|
|
makepath($f_dir,\&fileerror,getlogger(__PACKAGE__));
|
|
|
|
#if (not -d $f_dir) {
|
|
# fileerror('cannot opendir ' . $f_dir . ': ' . $!,getlogger(__PACKAGE__));
|
|
# return;
|
|
#}
|
|
|
|
#local *DBDIR;
|
|
#if (not opendir(DBDIR, $f_dir)) {
|
|
# fileerror('cannot opendir ' . $f_dir . ': ' . $!,getlogger(__PACKAGE__));
|
|
# return;
|
|
#}
|
|
#closedir DBDIR;
|
|
|
|
#changemod($f_dir);
|
|
|
|
return $f_dir;
|
|
}
|
|
|
|
sub db_connect {
|
|
|
|
my $self = shift;
|
|
|
|
my ($db_dir,$csv_tables) = @_;
|
|
|
|
$self->SUPER::db_connect($db_dir,$csv_tables);
|
|
|
|
$self->{db_dir} = $db_dir;
|
|
$self->{csv_tables} = $csv_tables;
|
|
$self->{f_dir} = $self->_createdatabase($db_dir);
|
|
|
|
my $dbh_config = {
|
|
f_schema => undef,
|
|
#f_lock => 0, n/a in 0.26 yet?
|
|
cvs_eol => $default_csv_config->{eol},
|
|
cvs_sep_char => $default_csv_config->{sep_char},
|
|
cvs_quote_char => $default_csv_config->{quote_char},
|
|
cvs_escape_char => $default_csv_config->{escape_char},
|
|
PrintError => 0,
|
|
RaiseError => 0,
|
|
};
|
|
my $usetabledef = 0;
|
|
if (defined $csv_tables and ref $csv_tables eq 'HASH') {
|
|
$usetabledef = 1;
|
|
} else {
|
|
$dbh_config->{f_dir} = $self->{f_dir};
|
|
$dbh_config->{f_ext} = $csvextension . '/r';
|
|
}
|
|
|
|
my $dbh = DBI->connect ('dbi:CSV:','','',$dbh_config) or
|
|
dberror($self,'error connecting: ' . $self->{drh}->errstr(),getlogger(__PACKAGE__));
|
|
|
|
$dbh->{InactiveDestroy} = 1;
|
|
|
|
$dbh->{LongReadLen} = $LongReadLen;
|
|
$dbh->{LongTruncOk} = $LongTruncOk;
|
|
|
|
$self->{dbh} = $dbh;
|
|
|
|
if ($usetabledef) {
|
|
my @files = ();
|
|
foreach my $tablename (keys %$csv_tables) {
|
|
$dbh->{csv_tables}->{$tablename} = $csv_tables->{$tablename};
|
|
push @files,$csv_tables->{$tablename}->{file};
|
|
dbinfo($self,'using ' . $csv_tables->{$tablename}->{file},getlogger(__PACKAGE__));
|
|
}
|
|
$self->{files} = \@files;
|
|
} else {
|
|
my @tablenames = $self->_list_tables();
|
|
foreach my $tablename (@tablenames) {
|
|
$dbh->{csv_tables}->{$tablename} = { eol => $default_csv_config->{eol},
|
|
sep_char => $default_csv_config->{sep_char},
|
|
quote_char => $default_csv_config->{quote_char},
|
|
escape_char => $default_csv_config->{escape_char},
|
|
}
|
|
}
|
|
}
|
|
|
|
dbinfo($self,'connected',getlogger(__PACKAGE__));
|
|
|
|
}
|
|
|
|
|
|
sub _list_tables {
|
|
my $self = shift;
|
|
my @table_list;
|
|
|
|
eval {
|
|
@table_list = map { local $_ = $_; s/^\.\///g; $_; } $self->{dbh}->func('list_tables');
|
|
};
|
|
if ($@) {
|
|
my @tables;
|
|
eval {
|
|
@tables = $self->{dbh}->func("get_avail_tables") or return;
|
|
};
|
|
if ($@) {
|
|
dberror($self,'error listing csv tables: ' . $@,getlogger(__PACKAGE__));
|
|
} else {
|
|
foreach my $ref (@tables) {
|
|
if (defined $ref) {
|
|
if (ref $ref eq 'ARRAY') {
|
|
push @table_list, $ref->[2];
|
|
#} else {
|
|
# push @table_list, $ref;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return @table_list; #removeduplicates(\@table_list);
|
|
}
|
|
|
|
sub _db_disconnect {
|
|
|
|
my $self = shift;
|
|
|
|
$self->SUPER::_db_disconnect();
|
|
|
|
}
|
|
|
|
sub vacuum {
|
|
|
|
my $self = shift;
|
|
my $tablename = shift;
|
|
|
|
}
|
|
|
|
sub cleanupcsvdirs {
|
|
|
|
my (@remainingdbdirs) = @_;
|
|
local *DBDIR;
|
|
if (not opendir(DBDIR, $csv_path)) {
|
|
fileerror('cannot opendir ' . $csv_path . ': ' . $!,getlogger(__PACKAGE__));
|
|
return;
|
|
}
|
|
my @dirs = grep { $_ ne '.' && $_ ne '..' && -d $csv_path . $_ } readdir(DBDIR);
|
|
closedir DBDIR;
|
|
my @remainingdbdirectories = ();
|
|
foreach my $dirname (@remainingdbdirs) {
|
|
push @remainingdbdirectories,$csv_path . $dirname . '/';
|
|
}
|
|
foreach my $dir (@dirs) {
|
|
#print $file;
|
|
my $dirpath = $csv_path . $dir . '/';
|
|
if (not contains($dirpath,\@remainingdbdirectories)) {
|
|
#if (remove_tree($dirpath) == 0) {
|
|
# filewarn('cannot remove ' . $dirpath . ': ' . $!,getlogger(__PACKAGE__));
|
|
#}
|
|
remove_tree($dirpath, {
|
|
'keep_root' => 0,
|
|
'verbose' => 1,
|
|
'error' => \my $err });
|
|
if (@$err) {
|
|
for my $diag (@$err) {
|
|
my ($file, $message) = %$diag;
|
|
if ($file eq '') {
|
|
filewarn("cleanup: $message",getlogger(__PACKAGE__));
|
|
} else {
|
|
filewarn("problem unlinking $file: $message",getlogger(__PACKAGE__));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
sub getfieldnames {
|
|
|
|
my $self = shift;
|
|
my $tablename = shift;
|
|
|
|
my $fieldnames = [];
|
|
|
|
if (defined $self->{dbh}) {
|
|
|
|
my $query = 'SELECT * FROM ' . $self->tableidentifier($tablename) . ' LIMIT 1';
|
|
dbdebug($self,'getfieldnames: ' . $query,getlogger(__PACKAGE__));
|
|
my $sth = $self->{dbh}->prepare($query) or $self->_prepare_error($query);
|
|
$sth->execute() or $self->_execute_error($query,$sth,());
|
|
$fieldnames = $sth->{NAME};
|
|
$sth->finish();
|
|
|
|
}
|
|
|
|
return $fieldnames;
|
|
|
|
}
|
|
|
|
sub getprimarykeycols {
|
|
|
|
my $self = shift;
|
|
my $tablename = shift;
|
|
return [];
|
|
|
|
}
|
|
|
|
sub create_primarykey {
|
|
|
|
my $self = shift;
|
|
my ($tablename,$keycols,$fieldnames) = @_;
|
|
|
|
return 0;
|
|
}
|
|
sub create_indexes {
|
|
|
|
my $self = shift;
|
|
my ($tablename,$indexes,$keycols) = @_;
|
|
|
|
return 0;
|
|
}
|
|
|
|
sub _gettablefilename {
|
|
|
|
my $self = shift;
|
|
my $tablename = shift;
|
|
return $self->{f_dir} . $tablename . $csvextension;
|
|
|
|
}
|
|
|
|
sub copytablefile {
|
|
|
|
my $self = shift;
|
|
my $tablename = shift;
|
|
my $target = shift;
|
|
my $drop_header_line = shift;
|
|
my $tablefilename = $self->_gettablefilename($tablename);
|
|
$self->db_disconnect();
|
|
if (File::Copy::copy($tablefilename,$target)) {
|
|
`sed -i '1d' $target` if $drop_header_line;
|
|
dbinfo($self,"$tablefilename copied to $target",getlogger(__PACKAGE__));
|
|
} else {
|
|
dberror($self,"copy from $tablefilename to $target failed: $!",getlogger(__PACKAGE__));
|
|
}
|
|
|
|
}
|
|
|
|
sub create_texttable {
|
|
|
|
my $self = shift;
|
|
my ($tablename,$fieldnames,$keycols,$indexes,$truncate) = @_;
|
|
|
|
if (length($tablename) > 0 and defined $fieldnames and ref $fieldnames eq 'ARRAY') {
|
|
|
|
my $created = 0;
|
|
if ($self->table_exists($tablename) == 0) {
|
|
|
|
if (not exists $self->{dbh}->{csv_tables}->{$tablename}) {
|
|
$self->{dbh}->{csv_tables}->{$tablename} = { eol => $default_csv_config->{eol},
|
|
sep_char => $default_csv_config->{sep_char},
|
|
quote_char => $default_csv_config->{quote_char},
|
|
escape_char => $default_csv_config->{escape_char},
|
|
};
|
|
}
|
|
|
|
my $statement = 'CREATE TABLE ' . $self->tableidentifier($tablename) . ' (';
|
|
$statement .= join(' TEXT, ',map { local $_ = $_; $_ = $self->columnidentifier($_); $_; } @$fieldnames) . ' TEXT';
|
|
$statement .= ')';
|
|
|
|
$self->db_do($statement);
|
|
|
|
changemod($self->_gettablefilename($tablename));
|
|
|
|
texttablecreated($self,$tablename,getlogger(__PACKAGE__));
|
|
|
|
$created = 1;
|
|
} else {
|
|
my $fieldnamesfound = $self->getfieldnames($tablename);
|
|
if (not setcontains($fieldnames,$fieldnamesfound,1)) {
|
|
fieldnamesdiffer($self,$tablename,$fieldnames,$fieldnamesfound,getlogger(__PACKAGE__));
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
if (not $created and $truncate) {
|
|
$self->truncate_table($tablename);
|
|
}
|
|
return 1;
|
|
} else {
|
|
return 0;
|
|
}
|
|
|
|
}
|
|
|
|
sub multithreading_supported {
|
|
|
|
my $self = shift;
|
|
return 0;
|
|
|
|
}
|
|
|
|
sub rowblock_transactional {
|
|
|
|
my $self = shift;
|
|
return $rowblock_transactional;
|
|
|
|
}
|
|
|
|
sub truncate_table {
|
|
|
|
my $self = shift;
|
|
my $tablename = shift;
|
|
|
|
$self->db_do('DELETE FROM ' . $self->tableidentifier($tablename));
|
|
tabletruncated($self,$tablename,getlogger(__PACKAGE__));
|
|
|
|
}
|
|
|
|
sub table_exists {
|
|
|
|
my $self = shift;
|
|
my $tablename = shift;
|
|
|
|
if (defined $self->{dbh}) {
|
|
my @tables = $self->_list_tables();
|
|
return contains($tablename,\@tables);
|
|
}
|
|
|
|
return undef;
|
|
|
|
}
|
|
|
|
sub drop_table {
|
|
|
|
my $self = shift;
|
|
my $tablename = shift;
|
|
|
|
if ($self->table_exists($tablename) > 0) {
|
|
$self->db_do('DROP TABLE ' . $self->tableidentifier($tablename));
|
|
delete $self->{dbh}->{csv_tables}->{$tablename};
|
|
tabledropped($self,$tablename,getlogger(__PACKAGE__));
|
|
return 1;
|
|
}
|
|
return 0;
|
|
|
|
}
|
|
|
|
sub db_begin {
|
|
|
|
my $self = shift;
|
|
if (defined $self->{dbh}) {
|
|
dbdebug($self, "transactions not supported",getlogger(__PACKAGE__));
|
|
}
|
|
|
|
}
|
|
|
|
sub db_commit {
|
|
|
|
my $self = shift;
|
|
if (defined $self->{dbh}) {
|
|
dbdebug($self, "transactions not supported",getlogger(__PACKAGE__));
|
|
}
|
|
|
|
}
|
|
|
|
sub db_rollback {
|
|
|
|
my $self = shift;
|
|
if (defined $self->{dbh}) {
|
|
dbdebug($self, "transactions not supported",getlogger(__PACKAGE__));
|
|
}
|
|
|
|
}
|
|
|
|
sub db_do_begin {
|
|
|
|
my $self = shift;
|
|
my $query = shift;
|
|
#my $tablename = shift;
|
|
|
|
$self->SUPER::db_do_begin($query,$rowblock_transactional,@_);
|
|
|
|
}
|
|
|
|
sub db_get_begin {
|
|
|
|
my $self = shift;
|
|
my $query = shift;
|
|
#my $tablename = shift;
|
|
|
|
$self->SUPER::db_get_begin($query,$rowblock_transactional,@_);
|
|
|
|
}
|
|
|
|
sub db_finish {
|
|
|
|
my $self = shift;
|
|
my $rollback = shift;
|
|
|
|
$self->SUPER::db_finish($rowblock_transactional,$rollback);
|
|
|
|
}
|
|
|
|
sub xlsbin2csv {
|
|
|
|
my ($inputfile,$outputfile,$worksheetname,$sourcecharset) = @_;
|
|
|
|
return _convert_xlsbin2csv($inputfile,
|
|
$worksheetname,
|
|
$sourcecharset,
|
|
$outputfile,
|
|
'UTF-8',
|
|
$default_csv_config->{quote_char},
|
|
$default_csv_config->{escape_char},
|
|
$default_csv_config->{sep_char},
|
|
$default_csv_config->{eol});
|
|
|
|
}
|
|
|
|
sub _convert_xlsbin2csv {
|
|
|
|
my ($SourceFilename,$worksheet,$SourceCharset,$DestFilename,$DestCharset,$quote_char,$escape_char,$sep_char,$eol) = @_;
|
|
|
|
my $csvlinecount = 0;
|
|
|
|
xls2csvinfo('start converting ' . $SourceFilename . ' (worksheet ' . $worksheet . ') to ' . $DestFilename . ' ...',getlogger(__PACKAGE__));
|
|
|
|
$SourceCharset = 'UTF-8' unless $SourceCharset;
|
|
$DestCharset = $SourceCharset unless $DestCharset;
|
|
|
|
xls2csvinfo('reading ' . $SourceFilename . ' as ' . $SourceCharset,getlogger(__PACKAGE__));
|
|
|
|
my $XLS = IO::File->new();
|
|
if (not $XLS->open('<' . $SourceFilename)) {
|
|
fileerror('cannot open file ' . $SourceFilename . ': ' . $!,getlogger(__PACKAGE__));
|
|
return 0;
|
|
}
|
|
|
|
my $Formatter = Spreadsheet::ParseExcel::FmtUnicode->new(Unicode_Map => $SourceCharset);
|
|
|
|
my $parser = Spreadsheet::ParseExcel->new();
|
|
my $Book = $parser->parse($XLS,$Formatter); #$SourceFilename
|
|
|
|
if ( !defined $Book ) {
|
|
xls2csverror($parser->error(),getlogger(__PACKAGE__));
|
|
#die $parser->error(), ".\n";
|
|
$XLS->close();
|
|
return 0;
|
|
}
|
|
|
|
#my $Book = Spreadsheet::ParseExcel::Workbook->Parse($XLS, $Formatter) or xls2csverror('can\'t read spreadsheet',getlogger(__PACKAGE__));
|
|
|
|
my $Sheet;
|
|
if ($worksheet) {
|
|
|
|
#my $test = $Book->GetContent();
|
|
|
|
$Sheet = $Book->Worksheet($worksheet);
|
|
if (!defined $Sheet) {
|
|
xls2csverror('invalid spreadsheet',getlogger(__PACKAGE__));
|
|
return 0;
|
|
}
|
|
#unless ($O{'q'})
|
|
#{
|
|
# print qq|Converting the "$Sheet->{Name}" worksheet.\n|;
|
|
#}
|
|
xls2csvinfo('converting the ' . $Sheet->{Name} . ' worksheet',getlogger(__PACKAGE__));
|
|
} else {
|
|
($Sheet) = @{$Book->{Worksheet}};
|
|
if ($Book->{SheetCount}>1) {
|
|
#print qq|Multiple worksheets found. Will convert the "$Sheet->{Name}" worksheet.\n|;
|
|
xls2csvinfo('multiple worksheets found, converting ' . $Sheet->{Name},getlogger(__PACKAGE__));
|
|
}
|
|
}
|
|
|
|
unlink $DestFilename;
|
|
local *CSV;
|
|
if (not open(CSV,'>' . $DestFilename)) {
|
|
fileerror('cannot open file ' . $DestFilename . ': ' . $!,getlogger(__PACKAGE__));
|
|
$XLS->close();
|
|
return 0;
|
|
}
|
|
binmode CSV;
|
|
|
|
my $Csv = Text::CSV_XS->new({
|
|
'quote_char' => $quote_char,
|
|
'escape_char' => $escape_char,
|
|
'sep_char' => $sep_char,
|
|
'binary' => 1,
|
|
});
|
|
|
|
my $Recoder;
|
|
if ($DestCharset) {
|
|
$Recoder = Locale::Recode->new(from => $SourceCharset, to => $DestCharset);
|
|
}
|
|
|
|
for (my $Row = $Sheet->{MinRow}; defined $Sheet->{MaxRow} && $Row <= $Sheet->{MaxRow}; $Row++) {
|
|
my @Row;
|
|
for (my $Col = $Sheet->{MinCol}; defined $Sheet->{MaxCol} && $Col <= $Sheet->{MaxCol}; $Col++) {
|
|
my $Cell = $Sheet->{Cells}[$Row][$Col];
|
|
|
|
my $Value = "";
|
|
if ($Cell) {
|
|
$Value = $Cell->Value;
|
|
if ($Value eq 'GENERAL') {
|
|
# Sometimes numbers are read incorrectly as "GENERAL".
|
|
# In this case, the correct value should be in ->{Val}.
|
|
$Value = $Cell->{Val};
|
|
}
|
|
if ($DestCharset) {
|
|
$Recoder->recode($Value);
|
|
}
|
|
}
|
|
|
|
# We assume the line is blank if there is nothing in the first column.
|
|
last if $Col == $Sheet->{MinCol} and !$Value;
|
|
|
|
push(@Row,$Value);
|
|
}
|
|
|
|
next unless @Row;
|
|
|
|
my $Status = $Csv->combine(@Row);
|
|
|
|
if (!defined $Status) {
|
|
xls2csvwarn('csv error: ' . $Csv->error_input(),getlogger(__PACKAGE__));
|
|
}
|
|
|
|
if (defined $Status) {
|
|
print CSV $Csv->string();
|
|
if ($Row < $Sheet->{MaxRow}) {
|
|
print CSV $eol;
|
|
}
|
|
$csvlinecount++;
|
|
}
|
|
}
|
|
|
|
close CSV;
|
|
$XLS->close;
|
|
|
|
xls2csvinfo($csvlinecount . ' line(s) converted',getlogger(__PACKAGE__));
|
|
|
|
return $csvlinecount;
|
|
|
|
}
|
|
|
|
sub xlsxbin2csv {
|
|
|
|
my ($inputfile,$outputfile,$worksheetname) = @_;
|
|
|
|
return _convert_xlsxbin2csv($inputfile,
|
|
$worksheetname,
|
|
$outputfile,
|
|
'UTF-8',
|
|
$default_csv_config->{quote_char},
|
|
$default_csv_config->{escape_char},
|
|
$default_csv_config->{sep_char},
|
|
$default_csv_config->{eol});
|
|
|
|
}
|
|
|
|
sub _convert_xlsxbin2csv {
|
|
my ($SourceFilename,$worksheet,$DestFilename,$DestCharset,$quote_char,$escape_char,$sep_char,$eol) = @_;
|
|
|
|
my $csvlinecount = 0;
|
|
|
|
xls2csvinfo('start converting ' . $SourceFilename . ' (worksheet ' . $worksheet . ') to ' . $DestFilename . ' ...',getlogger(__PACKAGE__));
|
|
|
|
|
|
my $XLS = IO::File->new();
|
|
if (not $XLS->open('<' . $SourceFilename)) {
|
|
fileerror('cannot open file ' . $SourceFilename . ': ' . $!,getlogger(__PACKAGE__));
|
|
return 0;
|
|
} else {
|
|
$XLS->close();
|
|
}
|
|
|
|
#my $Formatter = Spreadsheet::ParseExcel::FmtUnicode->new(Unicode_Map => $SourceCharset);
|
|
|
|
my $reader = Excel::Reader::XLSX->new();
|
|
my $workbook = $reader->read_file($SourceFilename); #->parse($XLS,$Formatter); #$SourceFilename
|
|
|
|
my $SourceCharset = $workbook->{_reader}->encoding();
|
|
$DestCharset = $SourceCharset unless $DestCharset;
|
|
|
|
xls2csvinfo('reading ' . $SourceFilename . ' as ' . $SourceCharset,getlogger(__PACKAGE__));
|
|
|
|
if ( !defined $workbook ) {
|
|
xls2csverror($reader->error(),getlogger(__PACKAGE__));
|
|
#die $parser->error(), ".\n";
|
|
#$XLS->close();
|
|
return 0;
|
|
}
|
|
|
|
#my $Book = Spreadsheet::ParseExcel::Workbook->Parse($XLS, $Formatter) or xls2csverror('can\'t read spreadsheet',getlogger(__PACKAGE__));
|
|
|
|
my $sheet;
|
|
if ($worksheet) {
|
|
|
|
#my $test = $Book->GetContent();
|
|
|
|
$sheet = $workbook->worksheet($worksheet);
|
|
if (!defined $sheet) {
|
|
xls2csverror('invalid spreadsheet',getlogger(__PACKAGE__));
|
|
return 0;
|
|
}
|
|
#unless ($O{'q'})
|
|
#{
|
|
# print qq|Converting the "$Sheet->{Name}" worksheet.\n|;
|
|
#}
|
|
xls2csvinfo('converting the ' . $sheet->name() . ' worksheet',getlogger(__PACKAGE__));
|
|
} else {
|
|
$sheet = $workbook->worksheet(0);
|
|
if (@{$workbook->worksheets()} > 1) {
|
|
#print qq|Multiple worksheets found. Will convert the "$Sheet->{Name}" worksheet.\n|;
|
|
xls2csvinfo('multiple worksheets found, converting ' . $sheet->name(),getlogger(__PACKAGE__));
|
|
}
|
|
}
|
|
|
|
unlink $DestFilename;
|
|
local *CSV;
|
|
if (not open(CSV,'>' . $DestFilename)) {
|
|
fileerror('cannot open file ' . $DestFilename . ': ' . $!,getlogger(__PACKAGE__));
|
|
#$XLS->close();
|
|
return 0;
|
|
}
|
|
binmode CSV;
|
|
|
|
my $csv = Text::CSV_XS->new({
|
|
'quote_char' => $quote_char,
|
|
'escape_char' => $escape_char,
|
|
'sep_char' => $sep_char,
|
|
'binary' => 1,
|
|
});
|
|
|
|
my $Recoder;
|
|
if ($DestCharset) {
|
|
$Recoder = Locale::Recode->new(from => $SourceCharset, to => $DestCharset);
|
|
}
|
|
|
|
while ( my $row = $sheet->next_row() ) {
|
|
|
|
foreach my $value ($row->values()) {
|
|
$Recoder->recode($value);
|
|
}
|
|
|
|
my $status = $csv->combine($row->values());
|
|
|
|
if (!defined $status) {
|
|
xls2csvwarn('csv error: ' . $csv->error_input(),getlogger(__PACKAGE__));
|
|
}
|
|
|
|
if (defined $status) {
|
|
if ($row->row_number() > 0) {
|
|
print CSV $eol;
|
|
}
|
|
print CSV $csv->string();
|
|
$csvlinecount++;
|
|
}
|
|
}
|
|
|
|
close CSV;
|
|
#$XLS->close;
|
|
|
|
xls2csvinfo($csvlinecount . ' line(s) converted',getlogger(__PACKAGE__));
|
|
|
|
return $csvlinecount;
|
|
|
|
}
|
|
|
|
1;
|