You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
ngcp-panel/lib/NGCP/Panel/Utils/CSVSeparator.pm

557 lines
17 KiB

package NGCP::Panel::Utils::CSVSeparator;
use 5.008;
use strict;
use warnings;
use Carp qw(carp croak);
our $VERSION = '0.20';
use Exporter;
use base 'Exporter';
our @EXPORT_OK = qw(get_separator);
sub get_separator {
my %options = @_;
my $file_path = $options{path};
my $c = $options{c};
# check options
my $echo;
if ($options{echo}) {
$echo = 1;
print "\nDetecting field separator of $file_path\n";
$c->log->debug("Detecting field separator of $file_path") if $c;
}
my (@excluded, @included);
if (exists $options{exclude}) {
@excluded = @{$options{exclude}};
}
if (exists $options{include}) {
@included = @{$options{include}};
}
my ($lucky, $colon_timecol, $comma_decsep, $comma_groupsep);
if (exists $options{lucky} && $options{lucky} == 1) {
$lucky = 1;
print "Scalar context...\n\n" if $echo;
$c->log->debug("Scalar context...") if $c;
} else {
$colon_timecol = $comma_decsep = $comma_groupsep = 1;
print "List context...\n\n" if $echo;
$c->log->debug("List context...") if $c;
}
# options checked
# Default set of candidates
my @candidates = (',', ';', ':', '|', "\t");
my %survivors;
$survivors{$_} = [] foreach (@candidates);
if (@excluded > 0) {
foreach (@excluded) {
delete $survivors{$_};
_message('deleted', $_, $c) if $echo;
}
}
if (@included > 0) {
foreach (@included) {
if (length($_) == 1) {
$survivors{$_} = [];
}
_message('added', $_, $c) if $echo;
}
}
if (keys %survivors == 0) {
carp "No candidates left!";
return;
}
my $csv;
open ($csv, "<:crlf", $file_path) || croak "Couldn't open $file_path: $!";
my $record_count = 0; # if $echo
while (<$csv>) {
my $record = $_;
chomp $record;
if ($echo) {
$record_count++;
print "\nRecord #$record_count\n";
$c->log->debug("Record #$record_count: $record...") if $c;
}
foreach my $candidate (keys %survivors) {
_message('candidate', $candidate, $c) if $echo;
my $rex = qr/\Q$candidate\E/;
my $count = 0;
$count++ while ($record =~ /$rex/g);
print "Count: $count\n" if $echo;
$c->log->debug("Count: $count") if $c;
if ($count > 0 && !$lucky) {
push @{$survivors{$candidate}}, $count;
} elsif ($count == 0) {
delete $survivors{$candidate};
}
}
if (!$lucky) {
$colon_timecol = _regularity($record, 'timecol') if $colon_timecol;
$comma_decsep = _regularity($record, 'decsep') if $comma_decsep;
$comma_groupsep = _regularity($record, 'groupsep') if $comma_groupsep;
}
my @alive = keys %survivors;
my $survivors_count = @alive;
if ($survivors_count == 1) {
if ($echo) {
_message('detected', $alive[0], $c);
print "Returning control to caller...\n\n";
$c->log->debug("Returning control to caller...") if $c;
}
close $csv;
if (!$lucky) {
return @alive;
} else {
return $alive[0];
}
} elsif ($survivors_count == 0) {
carp "\nNo candidates left!\n";
return;
}
}
# More than 1 survivor. 2nd pass to determine count variability
if ($lucky) {
print "\nSeveral candidates left\n" if $echo;
$c->log->debug("Several candidates left") if $c;
carp "\nBad luck. Couldn't determine the separator of $file_path.\n";
$c->log->debug("Bad luck. Couldn't determine the separator of $file_path.") if $c;
return;
} else {
print "\nVariability:\n\n" if $echo;
$c->log->debug("Variability:") if $c;
my %std_dev;
foreach my $candidate (keys %survivors) {
my $mean = _mean(@{$survivors{$candidate}});
$std_dev{$candidate} = _std_dev($mean, @{$survivors{$candidate}});
if ($echo) {
_message('candidate', $candidate, $c);
$c->log->debug("Mean: $mean\tStd Dev: $std_dev{$candidate}") if $c;
print "Mean: $mean\tStd Dev: $std_dev{$candidate}\n\n";
}
}
$c->log->debug("Couldn't determine the separator") if $c;
print "Couldn't determine the separator\n" if $echo;
close $csv;
my @penalized;
if ($colon_timecol) {
$c->log->debug("Detected time column") if $c;
print "Detected time column\n" if $echo;
delete $survivors{':'};
push @penalized, ':';
}
if ($comma_decsep || $comma_groupsep) {
delete $survivors{','};
push @penalized, ',';
if ($echo && $comma_decsep) {
$c->log->debug("Detected comma-separated decimal numbers column") if $c;
print "\nDetected comma-separated decimal numbers column\n";
}
if ($echo && $comma_groupsep) {
$c->log->debug("Detected comma-grouped numbers column") if $c;
print "\nDetected comma-grouped numbers column\n";
}
}
my @alive = sort {$std_dev{$a} <=> $std_dev{$b}} keys %survivors;
push @alive, sort {$std_dev{$a} <=> $std_dev{$b}} @penalized;
if ($echo) {
$c->log->debug("Remaining candidates: ") if $c;
print "Remaining candidates: ";
foreach my $left (@alive) {
_message('left', $left, $c);
}
$c->log->debug("Returning control to caller...") if $c;
print "\n\nReturning control to caller...\n\n";
}
return @alive;
}
}
sub _mean {
my @array = @_;
my $sum = 0;
$sum += $_ foreach (@array);
my $mean = $sum / scalar(@array);
return $mean;
}
sub _std_dev {
my ($mean, @array) = @_;
my $sum = 0;
$sum += ($_ - $mean)**2 foreach (@array);
my $std_dev = sqrt( $sum / scalar(@array) );
return $std_dev;
}
sub _regularity {
my ($string, $kind) = @_;
my $time_rx = qr/
(?:^|(?<=\s|[T,;|\t]))
(?:[01]?[0-9]|2[0-3]) # hours
:
(?:[0-5][0-9]) # minutes
(?::[0-5][0-9])? # seconds
(?:
Z
|
\.\d+
|
(?:\+|-)
(?:[01]?[0-9]|2[0-3])
:
(?:[0-5][0-9])
)?
(?=$|\s|[,;|\t])
/x;
my $commadecsep_rx = qr/
(?:^|(?<=[^\d,.]))
(?:
[-+]?
(?:
\d{0,3}?(?:\.\d{3})*
|
\d+
)
,\d+
)
(?=$|[^\d,.])
/x;
my $commagroupsep_rx = qr/
(?:^|(?<=[^\d,.]))
(?:
[-+]?\d{0,3}?
(?:,\d{3})+
(?:\.\d+)?
)
(?=$|[^\d,.])
/x;
return 0 if ($kind eq 'timecol' && $string !~ /$time_rx/);
return 0 if ($kind eq 'decsep' && $string !~ /$commadecsep_rx/);
return 0 if ($kind eq 'groupsep' && $string !~ /$commagroupsep_rx/);
return 1;
}
sub _message {
my ($type, $candidate, $c) = @_;
my $char;
if (ord $candidate == 9) { # tab character
$char = "\\t";
} else {
$char = $candidate;
}
my %message = (
deleted => "Deleted $char from candidates list\n",
added => "Added $char to candidates list\n",
candidate => "Candidate: $char\t",
detected => "\nSeparator detected: $char\n",
left => " $char ",
);
$c->log->debug($message{$type}) if $c;
print $message{$type};
}
1;
__END__
=head1 NAME
Text::CSV::Separator - Determine the field separator of a CSV file
=head1 VERSION
Version 0.20 - November 2, 2008
=head1 SYNOPSIS
use Text::CSV::Separator qw(get_separator);
my @char_list = get_separator(
path => $csv_path,
exclude => $array1_ref, # optional
include => $array2_ref, # optional
echo => 1, # optional
);
my $separator;
if (@char_list) {
if (@char_list == 1) { # successful detection
$separator = $char_list[0];
} else { # several candidates passed the tests
# Some code here
} else { # no candidate passed the tests
# Some code here
}
# "I'm Feeling Lucky" alternative interface
# Don't forget to include the 'lucky' parameter
my $separator = get_separator(
path => $csv_path,
lucky => 1,
exclude => $array1_ref, # optional
include => $array2_ref, # optional
echo => 1, # optional
);
=head1 DESCRIPTION
This module provides a fast detection of the field separator character (also
called field delimiter) of a CSV file, or more generally, of a character
separated text file (also called delimited text file), and returns it ready
to use in a CSV parser (e.g., Text::CSV_XS, Tie::CSV_File, or
Text::CSV::Simple).
This may be useful to the vulnerable -and often ignored- population of
programmers who need to process automatically CSV files from different sources.
The default set of candidates contains the following characters:
',' ';' ':' '|' '\t'
The only required parameter is the CSV file path. Optionally, the user can
specify characters to be excluded or included in the list of candidates.
The routine returns an array containing the list of candidates that passed
the tests. If it succeeds, this array will contain only one value: the field
separator we are looking for. On the other hand, if no candidate survives
the tests, it will return an empty list.
The technique used is based on the following principle:
=over 8
=item *
For every line in the file, the number of instances of the separator
character acting as separators must be an integer constant > 0 , although
a line may also contain some instances of that character as
literal characters.
=item *
Most of the other candidates won't appear in a typical CSV line.
=back
As soon as a candidate misses a line, it will be removed from the candidates
list.
This is the first test done to the CSV file. In most cases, it will detect the
separator after processing the first few lines. In particular, if the file
contains a header line, one line will probably be enough to get the job done.
Processing will stop and return control to the caller as soon as the program
reaches a status of 1 single candidate (or 0 candidates left).
If the routine cannot determine the separator in the first pass, it will do
a second pass based on several heuristic techniques. It checks whether the
file has columns consisting of time values, comma-separated decimal numbers,
or numbers containing a comma as the group separator, which can lead to false
positives in files that don't have a header row. It also measures the
variability of the remaining candidates.
Of course, you can always create a CSV file capable of resisting the siege,
but this approach will work correctly in many cases. The possibility of
excluding some of the default candidates may help to resolve cases with
several possible winners.
The resulting array contains the list of possible separators sorted by their
likelihood, being the first array item the most probable separator.
The module also provides an alternative interface with a simpler syntax,
which can be handy if you think that the files your program will have
to deal with aren't too exotic. To use it you only have to add the
B<lucky =E<gt> 1> key-value pair to the parameters hash and the routine
will return a single value, so you can assign it directly to a scalar variable.
If no candidate survives the first pass, it will return C<undef>.
The code skips the 2nd pass, which is usually unnecessary, so the program
won't store counts and won't check any existing regularities. Hence, it will
run faster and will require less memory. This approach should be enough in
most cases.
=head1 FUNCTIONS
=over 4
=item C<get_separator(%options)>
Returns an array containing the field separator character (or characters, if
more than one candidate passed the tests) of a CSV file. In case no candidate
passes the tests, it returns an empty list.
The available parameters are:
=over 8
=item * C<path>
Required. The path to the CSV file.
=item * C<exclude>
Optional. Array containing characters to be excluded from the candidates list.
=item * C<include>
Optional. Array containing characters to be included in the candidates list.
=item * C<lucky>
Optional. If selected, get_separator will return one single character,
or C<undef> in case no separator is detected. Off by default.
=item * C<echo>
Optional. Writes to the standard output messages describing the actions
performed. Off by default.
This is useful to keep track of what's going on, especially for debugging
purposes.
=back
=back
=head1 EXPORT
None by default.
=head1 EXAMPLE
Consider the following scenario: Your program must process a batch of csv files,
and you know that the separator could be a comma, a semicolon or a tab.
You also know that one of the fields contains time values. This field will
provide a fixed number of colons that could mislead the detection code.
In this case, you should exclude the colon (and you can also exclude the other
default candidate not considered, the pipe character):
my @char_list = get_separator(
path => $csv_path,
exclude => [':', '|'],
);
if (@char_list) {
my $separator;
if (@char_list == 1) {
$separator = $char_list[0];
} else {
# Some code here
}
}
# Using the "I'm Feeling Lucky" interface:
my $separator = get_separator(
path => $csv_path,
lucky => 1,
exclude => [':', '|'],
);
=head1 MOTIVATION
Despite the popularity of XML, the CSV file format is still widely used
for data exchange between applications, because of its much lower overhead:
It requires much less bandwidth and storage space than XML, and it also has
a better performance under compression (see the References below).
Unfortunately, there is no formal specification of the CSV format.
The Microsoft Excel implementation is the most widely used and it has become
a I<de facto> standard, but the variations are almost endless.
One of the biggest annoyances of this format is that in most cases you don't
know a priori what is the field separator character used in a file.
CSV stands for "comma-separated values", but most of the spreadsheet
applications let the user select the field delimiter from a list of several
different characters when saving or exporting data to a CSV file.
Furthermore, in a Windows system, when you save a spreadsheet in Excel as a
CSV file, Excel will use as the field delimiter the default list separator of
your system's locale, which happens to be a semicolon for several European
languages. You can even customize this setting and use the list separator you
like. For these and other reasons, automating the processing of CSV files is a
risky task.
This module can be used to determine the separator character of a delimited
text file of any kind, but since the aforementioned ambiguity problems occur
mainly in CSV files, I decided to use the Text::CSV:: namespace.
=head1 REFERENCES
L<http://www.creativyst.com/Doc/Articles/CSV/CSV01.htm>
L<http://www.xml.com/pub/a/2004/12/15/deviant.html>
=head1 SEE ALSO
There's another module in CPAN for this task, Text::CSV::DetectSeparator,
which follows a different approach.
=head1 ACKNOWLEDGEMENTS
Many thanks to Xavier Noria for wise suggestions.
The author is also grateful to Thomas Zahreddin, Benjamin Erhart, Ferdinand Gassauer,
and Mario Krauss for valuable comments and bug reports.
=head1 AUTHOR
Enrique Nell, E<lt>perl_nell@telefonica.netE<gt>
=head1 COPYRIGHT
Copyright (C) 2006 by Enrique Nell.
=head1 LICENSE
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.
=cut