You can not select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					
					
						
							377 lines
						
					
					
						
							10 KiB
						
					
					
				
			
		
		
	
	
							377 lines
						
					
					
						
							10 KiB
						
					
					
				| package NGCP::Panel::Utils::Utf8;
 | |
| 
 | |
| use 5.007003;
 | |
| 
 | |
| use strict;
 | |
| use warnings;
 | |
| 
 | |
| use Encode;
 | |
| use charnames ':full';
 | |
| 
 | |
| our $VERSION = "1.01";
 | |
| 
 | |
| ## no critic (Subroutines::ProhibitSubroutinePrototypes)
 | |
| 
 | |
| # A Regexp string to match valid UTF8 bytes
 | |
| # this info comes from page 78 of "The Unicode Standard 4.0"
 | |
| # published by the Unicode Consortium
 | |
| our $valid_utf8_regexp = <<'REGEX' ;
 | |
|         [\x{00}-\x{7f}]
 | |
|       | [\x{c2}-\x{df}][\x{80}-\x{bf}]
 | |
|       |         \x{e0} [\x{a0}-\x{bf}][\x{80}-\x{bf}]
 | |
|       | [\x{e1}-\x{ec}][\x{80}-\x{bf}][\x{80}-\x{bf}]
 | |
|       |         \x{ed} [\x{80}-\x{9f}][\x{80}-\x{bf}]
 | |
|       | [\x{ee}-\x{ef}][\x{80}-\x{bf}][\x{80}-\x{bf}]
 | |
|       |         \x{f0} [\x{90}-\x{bf}][\x{80}-\x{bf}]
 | |
|       | [\x{f1}-\x{f3}][\x{80}-\x{bf}][\x{80}-\x{bf}][\x{80}-\x{bf}]
 | |
|       |         \x{f4} [\x{80}-\x{8f}][\x{80}-\x{bf}][\x{80}-\x{bf}]
 | |
| REGEX
 | |
| 
 | |
| =head1 NAME
 | |
| 
 | |
| Test::utf8 - handy utf8 tests
 | |
| 
 | |
| =head1 SYNOPSIS
 | |
| 
 | |
|   # check the string is good
 | |
|   is_valid_string($string);   # check the string is valid
 | |
|   is_sane_utf8($string);      # check not double encoded
 | |
| 
 | |
|   # check the string has certain attributes
 | |
|   is_flagged_utf8($string1);   # has utf8 flag set
 | |
|   is_within_ascii($string2);   # only has ascii chars in it
 | |
|   isnt_within_ascii($string3); # has chars outside the ascii range
 | |
|   is_within_latin_1($string4); # only has latin-1 chars in it
 | |
|   isnt_within_ascii($string5); # has chars outside the latin-1 range
 | |
| 
 | |
| =head1 DESCRIPTION
 | |
| 
 | |
| This module is a collection of tests useful for dealing with utf8 strings in
 | |
| Perl.
 | |
| 
 | |
| This module has two types of tests: The validity tests check if a string is
 | |
| valid and not corrupt, whereas the characteristics tests will check that string
 | |
| has a given set of characteristics.
 | |
| 
 | |
| =head2 Validity Tests
 | |
| 
 | |
| =over
 | |
| 
 | |
| =item is_valid_string($string, $testname)
 | |
| 
 | |
| Checks if the string is "valid", i.e. this passes and returns true unless
 | |
| the internal utf8 flag hasn't been set on scalar that isn't made up of a valid
 | |
| utf-8 byte sequence.
 | |
| 
 | |
| This should I<never> happen and, in theory, this test should always pass. Unless
 | |
| you (or a module you use) goes monkeying around inside a scalar using Encode's
 | |
| private functions or XS code you shouldn't ever end up in a situation where
 | |
| you've got a corrupt scalar.  But if you do, and you do, then this function
 | |
| should help you detect the problem.
 | |
| 
 | |
| To be clear, here's an example of the error case this can detect:
 | |
| 
 | |
|   my $mark = "Mark";
 | |
|   my $leon = "L\x{e9}on";
 | |
|   is_valid_string($mark);  # passes, not utf-8
 | |
|   is_valid_string($leon);  # passes, not utf-8
 | |
| 
 | |
|   my $iloveny = "I \x{2665} NY";
 | |
|   is_valid_string($iloveny);      # passes, proper utf-8
 | |
| 
 | |
|   my $acme = "L\x{c3}\x{a9}on";
 | |
|   Encode::_utf8_on($acme);      # (please don't do things like this)
 | |
|   is_valid_string($acme);       # passes, proper utf-8 byte sequence upgraded
 | |
| 
 | |
|   Encode::_utf8_on($leon);      # (this is why you don't do things like this)
 | |
|   is_valid_string($leon);       # fails! the byte \x{e9} isn't valid utf-8
 | |
| 
 | |
| =cut
 | |
| 
 | |
| sub is_valid_string($;$)
 | |
| {
 | |
|   my $string = shift;
 | |
|   my $name = shift || "valid string test";
 | |
| 
 | |
|   # check we're a utf8 string - if not, we pass.
 | |
|   unless (Encode::is_utf8($string))
 | |
|     { return _pass($name) }
 | |
| 
 | |
|   # work out at what byte (if any) we have an invalid byte sequence
 | |
|   # and return the correct test result
 | |
|   my $pos = _invalid_sequence_at_byte($string);
 | |
|   if (_ok(!defined($pos), $name)) { return 1 }
 | |
|   _diag("malformed byte sequence starting at byte $pos");
 | |
|   return;
 | |
| }
 | |
| 
 | |
| sub _invalid_sequence_at_byte($)
 | |
| {
 | |
|   my $string = shift;
 | |
| 
 | |
|   # examine the bytes that make up the string (not the chars)
 | |
|   # by turning off the utf8 flag (no, use bytes doesn't
 | |
|   # work, we're dealing with a regexp)
 | |
|   Encode::_utf8_off($string);  ## no critic (ProtectPrivateSubs)
 | |
| 
 | |
|   # work out the index of the first non matching byte
 | |
|   my $result = $string =~ m/^($valid_utf8_regexp)*/ogx;
 | |
| 
 | |
|   # if we matched all the string return the empty list
 | |
|   my $pos = pos $string || 0;
 | |
|   return if $pos == length($string);
 | |
| 
 | |
|   # otherwise return the position we found
 | |
|   return $pos
 | |
| }
 | |
| 
 | |
| =item is_sane_utf8($string, $name)
 | |
| 
 | |
| This test fails if the string contains something that looks like it
 | |
| might be dodgy utf8, i.e. containing something that looks like the
 | |
| multi-byte sequence for a latin-1 character but perl hasn't been
 | |
| instructed to treat as such.  Strings that are not utf8 always
 | |
| automatically pass.
 | |
| 
 | |
| Some examples may help:
 | |
| 
 | |
|   # This will pass as it's a normal latin-1 string
 | |
|   is_sane_utf8("Hello L\x{e9}eon");
 | |
| 
 | |
|   # this will fail because the \x{c3}\x{a9} looks like the
 | |
|   # utf8 byte sequence for e-acute
 | |
|   my $string = "Hello L\x{c3}\x{a9}on";
 | |
|   is_sane_utf8($string);
 | |
| 
 | |
|   # this will pass because the utf8 is correctly interpreted as utf8
 | |
|   Encode::_utf8_on($string)
 | |
|   is_sane_utf8($string);
 | |
| 
 | |
| Obviously this isn't a hundred percent reliable.  The edge case where
 | |
| this will fail is where you have C<\x{c2}> (which is "LATIN CAPITAL
 | |
| LETTER WITH CIRCUMFLEX") or C<\x{c3}> (which is "LATIN CAPITAL LETTER
 | |
| WITH TILDE") followed by one of the latin-1 punctuation symbols.
 | |
| 
 | |
|   # a capital letter A with tilde surrounded by smart quotes
 | |
|   # this will fail because it'll see the "\x{c2}\x{94}" and think
 | |
|   # it's actually the utf8 sequence for the end smart quote
 | |
|   is_sane_utf8("\x{93}\x{c2}\x{94}");
 | |
| 
 | |
| However, since this hardly comes up this test is reasonably reliable
 | |
| in most cases.  Still, care should be applied in cases where dynamic
 | |
| data is placed next to latin-1 punctuation to avoid false negatives.
 | |
| 
 | |
| There exists two situations to cause this test to fail; The string
 | |
| contains utf8 byte sequences and the string hasn't been flagged as
 | |
| utf8 (this normally means that you got it from an external source like
 | |
| a C library; When Perl needs to store a string internally as utf8 it
 | |
| does it's own encoding and flagging transparently) or a utf8 flagged
 | |
| string contains byte sequences that when translated to characters
 | |
| themselves look like a utf8 byte sequence.  The test diagnostics tells
 | |
| you which is the case.
 | |
| 
 | |
| =cut
 | |
| 
 | |
| # build my regular expression out of the latin-1 bytes
 | |
| # NOTE: This won't work if our locale is nonstandard will it?
 | |
| my $re_bit = join "|", map { Encode::encode("utf8",chr($_)) } (127..255);
 | |
| 
 | |
| sub is_sane_utf8($;$)
 | |
| {
 | |
|   my $string = shift;
 | |
|   my $name   = shift || "sane utf8";
 | |
| 
 | |
|   # regexp in scalar context with 'g', meaning this loop will run for
 | |
|   # each match.  Should only have to run it once, but will redo if
 | |
|   # the failing case turns out to be allowed in %allowed.
 | |
|   while ($string =~ /($re_bit)/o)
 | |
|   {
 | |
|     # work out what the double encoded string was
 | |
|     my $bytes = $1;
 | |
| 
 | |
|     my $index = $+[0] - length($bytes);
 | |
|     my $codes = join '', map { sprintf '<%00x>', ord($_) } split //, $bytes;
 | |
| 
 | |
|     # what character does that represent?
 | |
|     my $char = Encode::decode("utf8",$bytes);
 | |
|     my $ord  = ord($char);
 | |
|     my $hex  = sprintf '%00x', $ord;
 | |
|     $char = charnames::viacode($ord);
 | |
| 
 | |
|     # print out diagnostic messages
 | |
|     _fail($name);
 | |
|     _diag(qq{Found dodgy chars "$codes" at char $index\n});
 | |
|     if (Encode::is_utf8($string))
 | |
|       { _diag("Chars in utf8 string look like utf8 byte sequence.") }
 | |
|     else
 | |
|       { _diag("String not flagged as utf8...was it meant to be?\n") }
 | |
|     _diag("Probably originally a $char char - codepoint $ord (dec),"
 | |
|          ." $hex (hex)\n");
 | |
| 
 | |
|     return 0;
 | |
|   }
 | |
| 
 | |
|   # got this far, must have passed.
 | |
|   _ok(1,$name);
 | |
|   return 1;
 | |
| }
 | |
| 
 | |
| # historic name of method; deprecated
 | |
| sub is_dodgy_utf8 { goto &is_sane_utf8 }
 | |
| 
 | |
| =back
 | |
| 
 | |
| =head2 String Characteristic Tests
 | |
| 
 | |
| These routines allow you to check the range of characters in a string.
 | |
| Note that these routines are blind to the actual encoding perl
 | |
| internally uses to store the characters, they just check if the
 | |
| string contains only characters that can be represented in the named
 | |
| encoding:
 | |
| 
 | |
| =over
 | |
| 
 | |
| =item is_within_ascii
 | |
| 
 | |
| Tests that a string only contains characters that are in the ASCII
 | |
| character set.
 | |
| 
 | |
| =cut
 | |
| 
 | |
| sub is_within_ascii($;$)
 | |
| {
 | |
|   my $string = shift;
 | |
|   my $name   = shift || "within ascii";
 | |
| 
 | |
|   # look for anything that isn't ascii or pass
 | |
|   $string =~ /([^\x{00}-\x{7f}])/ or return _pass($name);
 | |
| 
 | |
|   # explain why we failed
 | |
|   my $dec = ord($1);
 | |
|   my $hex = sprintf '%02x', $dec;
 | |
| 
 | |
|   _fail($name);
 | |
|   _diag("Char $+[0] not ASCII (it's $dec dec / $hex hex)");
 | |
| 
 | |
|   return 0;
 | |
| }
 | |
| 
 | |
| =item is_within_latin_1
 | |
| 
 | |
| Tests that a string only contains characters that are in latin-1.
 | |
| 
 | |
| =cut
 | |
| 
 | |
| sub is_within_latin_1($;$)
 | |
| {
 | |
|   my $string = shift;
 | |
|   my $name   = shift || "within latin-1";
 | |
| 
 | |
|   # look for anything that isn't ascii or pass
 | |
|   $string =~ /([^\x{00}-\x{ff}])/ or return _pass($name);
 | |
| 
 | |
|   # explain why we failed
 | |
|   my $dec = ord($1);
 | |
|   my $hex = sprintf '%x', $dec;
 | |
| 
 | |
|   _fail($name);
 | |
|   _diag("Char $+[0] not Latin-1 (it's $dec dec / $hex hex)");
 | |
| 
 | |
|   return 0;
 | |
| }
 | |
| 
 | |
| sub is_within_latin1 { goto &is_within_latin_1 }
 | |
| 
 | |
| =back
 | |
| 
 | |
| Simply check if a scalar is or isn't flagged as utf8 by perl's
 | |
| internals:
 | |
| 
 | |
| =over
 | |
| 
 | |
| =item is_flagged_utf8($string, $name)
 | |
| 
 | |
| Passes if the string is flagged by perl's internals as utf8, fails if
 | |
| it's not.
 | |
| 
 | |
| =cut
 | |
| 
 | |
| sub is_flagged_utf8
 | |
| {
 | |
|   my $string = shift;
 | |
|   my $name = shift || "flagged as utf8";
 | |
|   return _ok(Encode::is_utf8($string),$name);
 | |
| }
 | |
| 
 | |
| =item isnt_flagged_utf8($string,$name)
 | |
| 
 | |
| The opposite of C<is_flagged_utf8>, passes if and only if the string
 | |
| isn't flagged as utf8 by perl's internals.
 | |
| 
 | |
| Note: you can refer to this function as C<isn't_flagged_utf8> if you
 | |
| really want to.
 | |
| 
 | |
| =cut
 | |
| 
 | |
| sub isnt_flagged_utf8($;$)
 | |
| {
 | |
|   my $string = shift;
 | |
|   my $name   = shift || "not flagged as utf8";
 | |
|   return _ok(!Encode::is_utf8($string), $name);
 | |
| }
 | |
| 
 | |
| sub isn::t_flagged_utf8($;$)
 | |
| {
 | |
|   my $string = shift;
 | |
|   my $name   = shift || "not flagged as utf8";
 | |
|   return _ok(!Encode::is_utf8($string), $name);
 | |
| }
 | |
| 
 | |
| =back
 | |
| 
 | |
| =head1 AUTHOR
 | |
| 
 | |
| Written by Mark Fowler B<mark@twoshortplanks.com>
 | |
| 
 | |
| =head1 COPYRIGHT
 | |
| 
 | |
| Copyright Mark Fowler 2004,2012.  All rights reserved.
 | |
| 
 | |
| =head1 LICENSE
 | |
| 
 | |
| This program is free software; you can redistribute it
 | |
| and/or modify it under the same terms as Perl itself.
 | |
| 
 | |
| =head1 BUGS
 | |
| 
 | |
| None known.  Please report any to me via the CPAN RT system.  See
 | |
| http://rt.cpan.org/ for more details.
 | |
| 
 | |
| =head1 SEE ALSO
 | |
| 
 | |
| L<Test::DoubleEncodedEntities> for testing for double encoded HTML
 | |
| entities.
 | |
| 
 | |
| =cut
 | |
| 
 | |
| sub _ok
 | |
| {
 | |
|   return 1;
 | |
| }
 | |
| sub _diag
 | |
| {
 | |
|   return;
 | |
| }
 | |
| 
 | |
| sub _fail
 | |
| {
 | |
|   return 0;
 | |
| }
 | |
| 
 | |
| sub _pass
 | |
| {
 | |
|   return 1;
 | |
| }
 | |
| 
 | |
| 1;
 |