package NGCP::Panel::Utils::CSVSeparator; use 5.008; use strict; use warnings; use Carp qw(carp croak); our $VERSION = '0.20'; use Exporter; use base 'Exporter'; our @EXPORT_OK = qw(get_separator); sub get_separator { my %options = @_; my $file_path = $options{path}; my $c = $options{c}; # check options my $echo; if ($options{echo}) { $echo = 1; print "\nDetecting field separator of $file_path\n"; $c->log->debug("Detecting field separator of $file_path") if $c; } my (@excluded, @included); if (exists $options{exclude}) { @excluded = @{$options{exclude}}; } if (exists $options{include}) { @included = @{$options{include}}; } my ($lucky, $colon_timecol, $comma_decsep, $comma_groupsep); if (exists $options{lucky} && $options{lucky} == 1) { $lucky = 1; print "Scalar context...\n\n" if $echo; $c->log->debug("Scalar context...") if $c; } else { $colon_timecol = $comma_decsep = $comma_groupsep = 1; print "List context...\n\n" if $echo; $c->log->debug("List context...") if $c; } # options checked # Default set of candidates my @candidates = (',', ';', ':', '|', "\t"); my %survivors; $survivors{$_} = [] foreach (@candidates); if (@excluded > 0) { foreach (@excluded) { delete $survivors{$_}; _message('deleted', $_, $c) if $echo; } } if (@included > 0) { foreach (@included) { if (length($_) == 1) { $survivors{$_} = []; } _message('added', $_, $c) if $echo; } } if (keys %survivors == 0) { carp "No candidates left!"; return; } my $csv; open ($csv, "<:crlf", $file_path) || croak "Couldn't open $file_path: $!"; my $record_count = 0; # if $echo while (<$csv>) { my $record = $_; chomp $record; if ($echo) { $record_count++; print "\nRecord #$record_count\n"; $c->log->debug("Record #$record_count: $record...") if $c; } foreach my $candidate (keys %survivors) { _message('candidate', $candidate, $c) if $echo; my $rex = qr/\Q$candidate\E/; my $count = 0; $count++ while ($record =~ /$rex/g); print "Count: $count\n" if $echo; $c->log->debug("Count: $count") if $c; if ($count > 0 && !$lucky) { push @{$survivors{$candidate}}, $count; } elsif ($count == 0) { delete $survivors{$candidate}; } } if (!$lucky) { $colon_timecol = _regularity($record, 'timecol') if $colon_timecol; $comma_decsep = _regularity($record, 'decsep') if $comma_decsep; $comma_groupsep = _regularity($record, 'groupsep') if $comma_groupsep; } my @alive = keys %survivors; my $survivors_count = @alive; if ($survivors_count == 1) { if ($echo) { _message('detected', $alive[0], $c); print "Returning control to caller...\n\n"; $c->log->debug("Returning control to caller...") if $c; } close $csv; if (!$lucky) { return @alive; } else { return $alive[0]; } } elsif ($survivors_count == 0) { carp "\nNo candidates left!\n"; return; } } # More than 1 survivor. 2nd pass to determine count variability if ($lucky) { print "\nSeveral candidates left\n" if $echo; $c->log->debug("Several candidates left") if $c; carp "\nBad luck. Couldn't determine the separator of $file_path.\n"; $c->log->debug("Bad luck. Couldn't determine the separator of $file_path.") if $c; return; } else { print "\nVariability:\n\n" if $echo; $c->log->debug("Variability:") if $c; my %std_dev; foreach my $candidate (keys %survivors) { my $mean = _mean(@{$survivors{$candidate}}); $std_dev{$candidate} = _std_dev($mean, @{$survivors{$candidate}}); if ($echo) { _message('candidate', $candidate, $c); $c->log->debug("Mean: $mean\tStd Dev: $std_dev{$candidate}") if $c; print "Mean: $mean\tStd Dev: $std_dev{$candidate}\n\n"; } } $c->log->debug("Couldn't determine the separator") if $c; print "Couldn't determine the separator\n" if $echo; close $csv; my @penalized; if ($colon_timecol) { $c->log->debug("Detected time column") if $c; print "Detected time column\n" if $echo; delete $survivors{':'}; push @penalized, ':'; } if ($comma_decsep || $comma_groupsep) { delete $survivors{','}; push @penalized, ','; if ($echo && $comma_decsep) { $c->log->debug("Detected comma-separated decimal numbers column") if $c; print "\nDetected comma-separated decimal numbers column\n"; } if ($echo && $comma_groupsep) { $c->log->debug("Detected comma-grouped numbers column") if $c; print "\nDetected comma-grouped numbers column\n"; } } my @alive = sort {$std_dev{$a} <=> $std_dev{$b}} keys %survivors; push @alive, sort {$std_dev{$a} <=> $std_dev{$b}} @penalized; if ($echo) { $c->log->debug("Remaining candidates: ") if $c; print "Remaining candidates: "; foreach my $left (@alive) { _message('left', $left, $c); } $c->log->debug("Returning control to caller...") if $c; print "\n\nReturning control to caller...\n\n"; } return @alive; } } sub _mean { my @array = @_; my $sum = 0; $sum += $_ foreach (@array); my $mean = $sum / scalar(@array); return $mean; } sub _std_dev { my ($mean, @array) = @_; my $sum = 0; $sum += ($_ - $mean)**2 foreach (@array); my $std_dev = sqrt( $sum / scalar(@array) ); return $std_dev; } sub _regularity { my ($string, $kind) = @_; my $time_rx = qr/ (?:^|(?<=\s|[T,;|\t])) (?:[01]?[0-9]|2[0-3]) # hours : (?:[0-5][0-9]) # minutes (?::[0-5][0-9])? # seconds (?: Z | \.\d+ | (?:\+|-) (?:[01]?[0-9]|2[0-3]) : (?:[0-5][0-9]) )? (?=$|\s|[,;|\t]) /x; my $commadecsep_rx = qr/ (?:^|(?<=[^\d,.])) (?: [-+]? (?: \d{0,3}?(?:\.\d{3})* | \d+ ) ,\d+ ) (?=$|[^\d,.]) /x; my $commagroupsep_rx = qr/ (?:^|(?<=[^\d,.])) (?: [-+]?\d{0,3}? (?:,\d{3})+ (?:\.\d+)? ) (?=$|[^\d,.]) /x; return 0 if ($kind eq 'timecol' && $string !~ /$time_rx/); return 0 if ($kind eq 'decsep' && $string !~ /$commadecsep_rx/); return 0 if ($kind eq 'groupsep' && $string !~ /$commagroupsep_rx/); return 1; } sub _message { my ($type, $candidate, $c) = @_; my $char; if (ord $candidate == 9) { # tab character $char = "\\t"; } else { $char = $candidate; } my %message = ( deleted => "Deleted $char from candidates list\n", added => "Added $char to candidates list\n", candidate => "Candidate: $char\t", detected => "\nSeparator detected: $char\n", left => " $char ", ); $c->log->debug($message{$type}) if $c; print $message{$type}; } 1; __END__ =head1 NAME Text::CSV::Separator - Determine the field separator of a CSV file =head1 VERSION Version 0.20 - November 2, 2008 =head1 SYNOPSIS use Text::CSV::Separator qw(get_separator); my @char_list = get_separator( path => $csv_path, exclude => $array1_ref, # optional include => $array2_ref, # optional echo => 1, # optional ); my $separator; if (@char_list) { if (@char_list == 1) { # successful detection $separator = $char_list[0]; } else { # several candidates passed the tests # Some code here } else { # no candidate passed the tests # Some code here } # "I'm Feeling Lucky" alternative interface # Don't forget to include the 'lucky' parameter my $separator = get_separator( path => $csv_path, lucky => 1, exclude => $array1_ref, # optional include => $array2_ref, # optional echo => 1, # optional ); =head1 DESCRIPTION This module provides a fast detection of the field separator character (also called field delimiter) of a CSV file, or more generally, of a character separated text file (also called delimited text file), and returns it ready to use in a CSV parser (e.g., Text::CSV_XS, Tie::CSV_File, or Text::CSV::Simple). This may be useful to the vulnerable -and often ignored- population of programmers who need to process automatically CSV files from different sources. The default set of candidates contains the following characters: ',' ';' ':' '|' '\t' The only required parameter is the CSV file path. Optionally, the user can specify characters to be excluded or included in the list of candidates. The routine returns an array containing the list of candidates that passed the tests. If it succeeds, this array will contain only one value: the field separator we are looking for. On the other hand, if no candidate survives the tests, it will return an empty list. The technique used is based on the following principle: =over 8 =item * For every line in the file, the number of instances of the separator character acting as separators must be an integer constant > 0 , although a line may also contain some instances of that character as literal characters. =item * Most of the other candidates won't appear in a typical CSV line. =back As soon as a candidate misses a line, it will be removed from the candidates list. This is the first test done to the CSV file. In most cases, it will detect the separator after processing the first few lines. In particular, if the file contains a header line, one line will probably be enough to get the job done. Processing will stop and return control to the caller as soon as the program reaches a status of 1 single candidate (or 0 candidates left). If the routine cannot determine the separator in the first pass, it will do a second pass based on several heuristic techniques. It checks whether the file has columns consisting of time values, comma-separated decimal numbers, or numbers containing a comma as the group separator, which can lead to false positives in files that don't have a header row. It also measures the variability of the remaining candidates. Of course, you can always create a CSV file capable of resisting the siege, but this approach will work correctly in many cases. The possibility of excluding some of the default candidates may help to resolve cases with several possible winners. The resulting array contains the list of possible separators sorted by their likelihood, being the first array item the most probable separator. The module also provides an alternative interface with a simpler syntax, which can be handy if you think that the files your program will have to deal with aren't too exotic. To use it you only have to add the B 1> key-value pair to the parameters hash and the routine will return a single value, so you can assign it directly to a scalar variable. If no candidate survives the first pass, it will return C. The code skips the 2nd pass, which is usually unnecessary, so the program won't store counts and won't check any existing regularities. Hence, it will run faster and will require less memory. This approach should be enough in most cases. =head1 FUNCTIONS =over 4 =item C Returns an array containing the field separator character (or characters, if more than one candidate passed the tests) of a CSV file. In case no candidate passes the tests, it returns an empty list. The available parameters are: =over 8 =item * C Required. The path to the CSV file. =item * C Optional. Array containing characters to be excluded from the candidates list. =item * C Optional. Array containing characters to be included in the candidates list. =item * C Optional. If selected, get_separator will return one single character, or C in case no separator is detected. Off by default. =item * C Optional. Writes to the standard output messages describing the actions performed. Off by default. This is useful to keep track of what's going on, especially for debugging purposes. =back =back =head1 EXPORT None by default. =head1 EXAMPLE Consider the following scenario: Your program must process a batch of csv files, and you know that the separator could be a comma, a semicolon or a tab. You also know that one of the fields contains time values. This field will provide a fixed number of colons that could mislead the detection code. In this case, you should exclude the colon (and you can also exclude the other default candidate not considered, the pipe character): my @char_list = get_separator( path => $csv_path, exclude => [':', '|'], ); if (@char_list) { my $separator; if (@char_list == 1) { $separator = $char_list[0]; } else { # Some code here } } # Using the "I'm Feeling Lucky" interface: my $separator = get_separator( path => $csv_path, lucky => 1, exclude => [':', '|'], ); =head1 MOTIVATION Despite the popularity of XML, the CSV file format is still widely used for data exchange between applications, because of its much lower overhead: It requires much less bandwidth and storage space than XML, and it also has a better performance under compression (see the References below). Unfortunately, there is no formal specification of the CSV format. The Microsoft Excel implementation is the most widely used and it has become a I standard, but the variations are almost endless. One of the biggest annoyances of this format is that in most cases you don't know a priori what is the field separator character used in a file. CSV stands for "comma-separated values", but most of the spreadsheet applications let the user select the field delimiter from a list of several different characters when saving or exporting data to a CSV file. Furthermore, in a Windows system, when you save a spreadsheet in Excel as a CSV file, Excel will use as the field delimiter the default list separator of your system's locale, which happens to be a semicolon for several European languages. You can even customize this setting and use the list separator you like. For these and other reasons, automating the processing of CSV files is a risky task. This module can be used to determine the separator character of a delimited text file of any kind, but since the aforementioned ambiguity problems occur mainly in CSV files, I decided to use the Text::CSV:: namespace. =head1 REFERENCES L L =head1 SEE ALSO There's another module in CPAN for this task, Text::CSV::DetectSeparator, which follows a different approach. =head1 ACKNOWLEDGEMENTS Many thanks to Xavier Noria for wise suggestions. The author is also grateful to Thomas Zahreddin, Benjamin Erhart, Ferdinand Gassauer, and Mario Krauss for valuable comments and bug reports. =head1 AUTHOR Enrique Nell, Eperl_nell@telefonica.netE =head1 COPYRIGHT Copyright (C) 2006 by Enrique Nell. =head1 LICENSE This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut