package langTools::Decode;

use Getopt::Long;
use File::Basename;
use strict;
use warnings;
use Carp qw(cluck carp);

use utf8;

use Exporter;
our ( $VERSION, @ISA, @EXPORT, @EXPORT_OK );

@ISA    = qw(Exporter);
@EXPORT = qw(&guess_body_encoding &guess_person_encoding &decode_para);

# Replacement tables, one per known mis-encoding ("typeNN").
#
# Each table maps a pattern that shows up in wrongly converted text to the
# character it should have been.  The keys are interpolated into regular
# expressions by the subs of this package, which is why a literal question
# mark is written as "\\?".
#
# When a key is listed twice inside one table, Perl keeps the value that is
# listed last.
our %Error_Types = (

    # mac-sami converted as iconv -f mac -t utf8
    "type01" => {
        "ª" => "š",
        "¥" => "Š",
        "º" => "ŧ",
        "µ" => "Ŧ",
        "∫" => "ŋ",
        "±" => "Ŋ",
        "¸" => "Ŋ",
        "π" => "đ",
        "∞" => "Đ",
        "Ω" => "ž",
        "∑" => "Ž",
        "∏" => "č",
        "¢" => "Č",
    },

    # iso-ir-197 converted as iconv -f mac -t utf8
    "type02" => {
        "·" => "á",
        "¡" => "Á",
        "≥" => "š",
        "≤" => "Š",
        "∏" => "ŧ",
        "µ" => "Ŧ",
        "±" => "ŋ",
        "Ø" => "Ŋ",
        "§" => "đ",
        "£" => "Đ",
        "∫" => "ž",
        "π" => "Ž",
        "¢" => "č",
        "°" => "Č",
        "Ê" => "æ",
        "Δ" => "Æ",
        "¯" => "ø",
        "ÿ" => "Ø",
        "Â" => "å",
        "≈" => "Å",
        "‰" => "ä",
        "ƒ" => "Ä",
        "ˆ" => "ö",
        "÷" => "Ö",
    },

    "type03" => {
        "ƒ" => "š",
        "√" => "ŋ",
        "∂" => "đ",
        "π" => "ž",
        "ª" => "č",
        "º" => "Č",
    },

    # winsami2 converted as iconv -f latin1 -t utf8
    "type04" => {
        "\xc2\x9a" => "š",
        "\xc2\x8a" => "Š",
        "¼"        => "ŧ",
        "º"        => "Ŧ",
        "¹"        => "ŋ",
        "¸"        => "Ŋ",
        "˜"        => "đ",
        "‰"        => "Đ",
        "¿"        => "ž",
        "¾"        => "Ž",
        "\xc3\xb6" => "č",
        "\xc3\x96" => "Č",
    },

    # iso-ir-197 converted as iconv -f latin1 -t utf8
    "type05" => {
        "³" => "š",
        "²" => "Š",
        "¸" => "ŧ",
        "µ" => "Ŧ",
        "±" => "ŋ",
        "¯" => "Ŋ",
        "¤" => "đ",
        "£" => "Đ",
        "º" => "ž",
        "¹" => "Ž",
        "¢" => "č",
        "¡" => "Č",
    },

    # mac-sami to latin1
    "type06" => {
        "‡" => "á",
        "‡" => "á",
        "ç" => "Á",
        "»" => "š",
        "´" => "Š",
        "¼" => "ŧ",
        "µ" => "Ŧ",
        "º" => "ŋ",
        "±" => "Ŋ",
        "¹" => "đ",
        "°" => "Đ",
        "½" => "ž",
        "·" => "Ž",
        "¸" => "č",
        "¢" => "Č",
        "¾" => "æ",
        "®" => "Æ",
        "¿" => "ø",
        "¯" => "Ø",
        "Œ" => "å",
        "Ž" => "é",
        "Œ" => "å",

        # Mac byte 0x81 ("Å") has no printable latin1 counterpart and ends up
        # as the control character U+0081.  It is spelled as an escape so
        # that the key can never silently degrade into an empty string (an
        # empty pattern would make Perl reuse the last successful regex).
        "\x{81}" => "Å",
        "Š"      => "ä",
        "€"      => "Ä",
        "š"      => "ö",
        "…"      => "Ö",
        "Ê"      => " ",
        "¤"      => "§",
        "Ò"      => "“",
        "Ó"      => "”",
        "ª "     => "™ ",
        "ªÓ"     => "™”",
        "Ã"      => "√",
        "Ð"      => "–",
    },

    # found in boundcorpus/goldstandard/orig/sme/facta/GIEHTAGIRJI.correct.doc
    # and boundcorpus/goldstandard/orig/sme/facta/learerhefte_-_vaatmarksfugler.doc
    "type07" => {
        "ð" => "đ",
        "Ç" => "Č",
        "ç" => "č",
        "ó" => "š",
        "ý" => "ŧ",
        "þ" => "ž",
    },

    # found in freecorpus/orig/sme/admin/sd/other_files/dc_00_1.doc
    # and freecorpus/orig/sme/admin/guovda/KS_02.12.99.doc
    # found in boundcorpus/orig/sme/bible/other_files/vitkan.pdf
    #
    # NOTE(review): "¹" and "¿" are listed twice with different values; the
    # later entries ("ŋ" and "ž") are the ones in effect.  Confirm that this
    # is intended.
    "type10" => {
        "ð" => "đ",
        "È" => "Č",
        "è" => "č",
        "¹" => "š",
        "¿" => "ŋ",
        "¾" => "ž",
        "¼" => "ŧ",
        "‚" => "Č",
        "„" => "č",
        "¹" => "ŋ",
        "˜" => "đ",
        "¿" => "ž",
    },

    # found in titles in Min Áigi docs
    # double utf'ed letters
    #
    # NOTE(review): many keys here are identical to their values and some keys
    # repeat with different values ("Ä\\?", "ä").  That looks like the table
    # itself was damaged by an encoding conversion -- verify against the
    # version control history before relying on it.
    "type11" => {
        "ï"      => "ï",
        "á"      => "á",
        "Ã\\?"   => "Á",
        "Å¡"     => "š",
        "¹"      => "š",
        "ž"      => "ž",
        "Ž"      => "Ž",
        "«"      => "«",
        "≤"      => "«",
        "»"      => "»",
        "≥"      => "»",
        "Ã…"     => "Å",
        "Ã¥"     => "å",
        "Ã…"     => "Å",
        "Ä\\?"   => "č",
        "è"      => "č",
        "ÄŒ"     => "Č",
        "Ä‘"     => "đ",
        "ð"      => "đ",
        "Ä\\?"   => "Đ",
        "ø"      => "ø",
        "Ø"      => "Ø",
        "ä"      => "ö",
        "ä"      => "ä",
        "Ä"      => "Ä",
        "ŧ"      => "ŧ",
        "é"      => "é",
        "â€\\?" => "”",
        "æ"      => "æ",
        "Å‹"     => "ŋ",
        "•"      => "•",
    },

    "type12" => {
        "t1"   => "ŧ",
        "T1"   => "Ŧ",
        "s1"   => "š",
        "S1"   => "Š",
        "n1"   => "ŋ",
        "N1"   => "Ŋ",
        "d1"   => "đ",
        "D1"   => "Đ",
        "z1"   => "ž",
        "Z1"   => "Ž",
        "c1"   => "č",
        "C1"   => "Č",
        "ᆱ" => "«",
        "ï¾»" => "»",
    },
);

# Value returned by the guessing subs when no table matches the text.
our $NO_ENCODING = 0;

# Generic error value.
our $ERROR = -1;

# Printing some test data, chars and their amounts
our $Test = 0;
# Guess which mis-encoding the given body text has gone through.
#
# Arguments:
#   $text - the text to examine; it is split into lines on "\n".
#
# Returns the name of the best matching table in %Error_Types (for example
# "type06"), or $NO_ENCODING when no table reaches its threshold or when
# $text is undefined.
#
# For every table the occurrences of each key are counted.  A table only
# takes part when enough *distinct* keys were seen; among those the table
# with the most hits wins, and on a tie the table that sorts last wins.
#
# The sub deliberately has no prototype: an empty prototype would make an
# ordinary call such as guess_body_encoding($text) fail at compile time.
sub guess_body_encoding {
    my ($text) = @_;

    my $encoding = $NO_ENCODING;
    return $encoding if !defined $text;

    # Number of distinct keys that must be present before a table is
    # considered.  Tables not listed here need $default_min_distinct.
    my %min_distinct_for = (
        "type11" => 1,
        "type06" => 2,
        "type07" => 2,
        "type10" => 2,
        "type01" => 2,
    );
    my $default_min_distinct = 4;

    my @lines    = split( "\n", $text );
    my $max_hits = 0;

    for my $type ( sort( keys %Error_Types ) ) {
        my %freq;

        for my $key ( keys %{ $Error_Types{$type} } ) {

            # An empty pattern makes Perl reuse the last successful regex,
            # which would count hits for an unrelated key.
            next if $key eq '';

            # Keys are regex source (a literal "?" is stored as "\\?"), so
            # they are compiled as they are and not quoted.
            my $pattern = qr/$key/;
            my $ignore_word_edges = ( $key eq '½' and $type eq "type06" );

            for my $line (@lines) {
                my $tmpline = $line;

                # ž is never found in the beginning or end of words
                if ($ignore_word_edges) {
                    $tmpline =~ s/(\s½|½\s)//g;
                }

                my $count = () = $tmpline =~ /$pattern/g;
                $freq{$key} += $count if $count;
            }
        }

        my $num = keys %freq;
        my $min_distinct =
            exists $min_distinct_for{$type}
            ? $min_distinct_for{$type}
            : $default_min_distinct;

        # Every threshold is at least 1, so this also skips tables
        # without any hit.
        next if $num < $min_distinct;

        my $hits = 0;
        $hits += $_ for values %freq;

        if ( $hits >= $max_hits ) {
            $max_hits = $hits;
            $encoding = $type;
        }

        # type11 always wins over type06 as long as there are any hits for type11
        if ( $encoding eq "type06" and $type eq "type11" ) {
            $encoding = $type;
        }

        if ($Test) {
            print "body_encoding type is $type, encoding is $encoding, freq is ", %freq, " num is ", $num, " hits is ", $hits, "\n";
        }
    }

    return $encoding;
}
# Guess which mis-encoding a person's name has gone through.
#
# Arguments:
#   $lastname - the name to examine.
#
# Returns the name of the best matching table in %Error_Types, or
# $NO_ENCODING when no key of any table occurs in the name or when $lastname
# is undefined.  Unlike guess_body_encoding, a single matching key is enough
# for a table to be considered.  On a tie the table that sorts last wins.
#
# The sub deliberately has no prototype: an empty prototype would make an
# ordinary call such as guess_person_encoding($name) fail at compile time.
sub guess_person_encoding {
    my ($lastname) = @_;

    my $encoding = $NO_ENCODING;
    return $encoding if !defined $lastname;

    my $max_hits = 0;

    for my $type ( sort( keys %Error_Types ) ) {
        my %freq;

        for my $key ( keys %{ $Error_Types{$type} } ) {

            # An empty pattern makes Perl reuse the last successful regex.
            next if $key eq '';

            my $count = () = $lastname =~ /$key/g;
            $freq{$key} = $count if $count;
        }

        next if !%freq;

        my $num  = keys %freq;
        my $hits = 0;
        $hits += $_ for values %freq;

        if ( $hits >= $max_hits ) {
            $max_hits = $hits;
            $encoding = $type;
        }

        # type11 wins over type06 when more than one distinct type11 key was
        # found.
        # NOTE(review): the original comment said "as long as there are any
        # hits", but the code has always required $num > 1 here -- confirm
        # which one is intended.
        if ( $encoding eq "type06" and $type eq "type11" and $num > 1 ) {
            $encoding = $type;
        }

        if ($Test) {
            print "type is $type, encoding is $encoding, freq is ", %freq, " num is ", $num, " hits is ", $hits, "\n";
        }
    }

    return $encoding;
}

# Repair the text behind $para_ref in place, using the table $encoding.
#
# Arguments:
#   $para_ref - reference to the string that is to be repaired.
#   $encoding - name of a table in %Error_Types, as returned by the
#               guessing subs.
#
# Always returns 0.  Nothing is changed when $encoding is undefined, equals
# $NO_ENCODING or does not name a table, or when there is no text.
#
# All keys are applied in one pass over the text.  Applying them one after
# the other lets the output of one replacement be picked up by a later key:
# in type06 "´" became "Š" and then "ä", because "Š" is itself a key.
sub decode_para {
    my ( $para_ref, $encoding ) = @_;

    return 0 if !defined $encoding or $encoding eq $NO_ENCODING;
    return 0 if !exists $Error_Types{$encoding};
    return 0 if !defined $para_ref or !defined ${$para_ref};

    my $table = $Error_Types{$encoding};

    # Longest keys first, so that a multi-character key is tried before a
    # shorter key that is a prefix of it.  Empty keys are left out because
    # they would match at every position.
    my @patterns =
        sort { length($b) <=> length($a) or $a cmp $b }
        grep { $_ ne '' } keys %{$table};
    return 0 if !@patterns;

    # One capture group per key.  @- has one entry per group up to the last
    # group that took part in the match, so its size tells which key matched.
    my $alternation = join( '|', map { "($_)" } @patterns );
    ${$para_ref} =~ s{$alternation}{ $table->{ $patterns[ scalar(@-) - 2 ] } }ge;

    return 0;
}

# Read the lines of $file and append them to the array behind $text_aref.
#
# Arguments:
#   $file         - path of the file to read.
#   $text_aref    - reference to the array that receives the lines.
#   $allow_nonutf - accepted, but not used by the code that survived.
#
# Returns 0 on success and the string "ERROR" on failure.
#
# NOTE(review): the part of this sub between the open() call and the read
# loop was lost in the copy this was restored from.  The UTF-8 read mode and
# the handling of a failed open() are reconstructions -- confirm them against
# the version control history.
sub read_file {
    my ( $file, $text_aref, $allow_nonutf ) = @_;

    my $fh;
    if ( !open( $fh, '<:encoding(UTF-8)', $file ) ) {
        carp "Cannot open file $file: $!";
        return "ERROR";
    }

    while ( my $line = <$fh> ) {
        if ( !utf8::is_utf8($line) ) {
            close($fh);
            return "ERROR";
        }
        push( @{$text_aref}, $line );
    }
    close($fh);

    return 0;
}

1;

__END__

=head1 NAME

langTools::Decode -- convert characters byte-wise to other characters.

=head1 SYNOPSIS

    use langTools::Decode;

    my $text = "...";    # text converted to utf-8 from an unknown encoding
    my $encoding;

    $encoding = &guess_body_encoding($text);
    &decode_para( \$text, $encoding );

    my $lastname = "...";
    $encoding = &guess_person_encoding($lastname);
    &decode_para( \$lastname, $encoding );

=head1 DESCRIPTION

langTools::Decode decodes characters to utf-8 byte-wise, using code tables.
It is intended for decoding Sámi characters in situations where a document
has been converted to utf-8 without knowing its original encoding. The
decoding is implemented with code table files, so the module can be used for
other conversions as well. The output, however, is always utf-8.

The module also contains a function for guessing the original encoding. It
takes into account only the most common Sámi characters and their frequency
in the text.

=head2 Code tables

Code tables are text files with the following format:
three space-separated columns:

=over 4

=item Column #1 is the input char (in hex as 0xXX or 0xXXXX)

=item Column #2 is the Unicode char (in hex as 0xXXXX)

=item Column #3 the Unicode name

=back

Most of the code tables are available at the Unicode Consortium:
L<http://www.unicode.org/Public/MAPPINGS/>

Some of the code tables, like samimac_roman and levi_winsam, are composed
from two code tables: the one that was used as the input encoding and the
one that was used when the file was converted to utf-8.

=over 4

=item samimac_roman: codetables samimac.txt and ROMAN.txt

=item levi_winsam: codetables levi.txt and CP1258.txt

=back

levi.txt and samimac.txt are available under Trond's home page (the link
target is missing from this copy of the documentation).
The codetables are composed using the function
C<&combine_two_codings($coding1, $coding2, $outfile)> which is available
in this package.

These encodings are available:

=over 4

=item latin6 => iso8859-10-1.txt

=item plainroman => ROMAN.txt

=item CP1258 => CP1258.txt

=item iso_ir_197 => iso_ir_197.txt

=item samimac_roman => samimac_roman.txt

=item levi_winsam => levi_CP1258.txt

=item winsam => winsam.txt

=item 8859-4 => 8859-4.txt

=back

=head2 Guessing the input encoding

The original input encoding is guessed by examining the text and searching
for the most common characters. The unicode characters in hex are listed in
the hash C<%Sami_Chars>, for Northern Sámi for example. The uncommented
characters are the ones that take part in guessing the encoding.
The encodings are listed in the hash C<%Charfiles>; they are tested one at a
time. The occurrences of the selected characters in that encoding are counted
and the one with the most occurrences is returned. There is room for more
statistical analysis, but this simple test worked for me.

If fewer than a certain number of characters are found, the test returns
C<$NO_ENCODING> (0), which means that the characters should already be
correctly utf-8 encoded.

=head1 BUGS

There may be mappings that are missing from the list of code tables.

=head1 AUTHOR

Saara Huhmarniemi