#!/usr/local/ymir/perl/bin/perl
#
# Character n-gram frequency report.
#
# Reads tab-separated lines from the files named on the command line (or
# from STDIN), concatenates the second field of every accepted line, counts
# every character n-gram of length 2 .. $maxgram inside each unbroken run of
# text, discards n-grams rarer than $printper percent of all counted
# characters, and prints the survivors grouped by length.  Under every
# n-gram the report also lists the surviving (n+1)-grams that start with it
# and then those that end with it.
#
# The input is handled as UTF-8 bytes (see $re_char); each report line is
# passed through Unicode::Japanese's set()->euc before printing.
# NOTE(review): the field layout (field 2 = text to count, field 4 =
# part-of-speech tag) looks like morphological-analyzer output -- confirm
# against whatever produces the input.

use strict;
use warnings;

use Unicode::Japanese;

# Settings
my $maxgram  = 5;       # longest n-gram, in characters, that is counted
my $printper = 0.01;    # minimum frequency, in percent of $charcount, to report

# Part-of-speech prefixes whose lines are replaced by a break instead of
# contributing text.  The two alternatives are the EUC-JP byte sequences for
# the tags "kigou" (symbol) and "meishi-suu" (noun-number); they are written
# as hex escapes so that this source file stays pure ASCII and the pattern
# cannot be damaged when the file is re-encoded.  get() hands the pattern
# back as a UTF-8 string so it can be matched against the UTF-8 input.
my $re = Unicode::Japanese->new(
    "^(\xb5\xad\xb9\xe6|\xcc\xbe\xbb\xec-\xbf\xf4)", 'euc'
)->get;
my $re_pos = qr/$re/;
my $uj     = Unicode::Japanese->new;

# ---------------------------------------------------------------------------
# 1. Collect the text.  A line with at most one field, or whose fourth field
#    matches $re_pos, becomes a break (a space); any other line contributes
#    its second field verbatim.
# ---------------------------------------------------------------------------
my $data = '';
while (my $line = <>) {
    my @word = split(/\t/, $line);

    # Lines with only two or three fields have no fourth field; they are
    # kept (as before) but must not be matched against an undefined value.
    if (@word <= 1 or (defined $word[3] and $word[3] =~ $re_pos)) {
        $data .= ' ';
        next;
    }
    $data .= $word[1];
}

# Every run of spaces becomes a single newline, which acts as a break below.
$data =~ s/ +/\n/g;

# ---------------------------------------------------------------------------
# 2. Split the text into characters.  $re_char matches one UTF-8 encoded
#    character (1 to 6 bytes); bytes that do not form such a sequence end up
#    in the non-empty fields between matches and are kept as units too.
# ---------------------------------------------------------------------------
my $re_char = qr/[\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3}|[\xf8-\xfb][\x80-\xbf]{4}|[\xfc-\xfd][\x80-\xbf]{5}/;
my @ch = grep { $_ ne '' } split(/($re_char)/, $data);

# ---------------------------------------------------------------------------
# 3. Count n-grams.  $clist[$n]{$gram} is the number of occurrences of the
#    $n-character string $gram.  Slots 0 and 1 are never filled because the
#    shortest n-gram recorded has two characters.
# ---------------------------------------------------------------------------
my @clist;
my @oldch;              # sliding window: the last <= $maxgram characters
my $charcount = 0;      # number of non-break characters seen
foreach my $ch (@ch) {
    if ($ch eq "\n" or $ch eq ' ') {
        @oldch = ();    # n-grams never span a break
        next;
    }
    $charcount++;
    push @oldch, $ch;
    shift @oldch if @oldch > $maxgram;

    # Record every n-gram that ends at the current character.
    for my $len (2 .. scalar @oldch) {
        $clist[$len]{ join('', @oldch[-$len .. -1]) }++;
    }
}

# ---------------------------------------------------------------------------
# 4. Drop n-grams whose share of all counted characters is below $printper.
#    @clist is only non-empty when $charcount > 0, so the division is safe.
# ---------------------------------------------------------------------------
foreach my $counts (grep { defined } @clist) {
    foreach my $gram (keys %$counts) {
        delete $counts->{$gram}
            if $counts->{$gram} / $charcount * 100 < $printper;
    }
}

# ---------------------------------------------------------------------------
# 5. Report.
# ---------------------------------------------------------------------------

# Prints one report line: "<indent><gram>: <count> (<percent>)".
my $emit = sub {
    my ($indent, $gram, $count) = @_;
    print $uj->set(
        "$indent$gram: $count "
        . sprintf('(%.2f)', $count / $charcount * 100) . "\n"
    )->euc;
};

# Returns the given n-grams ordered by descending count; equal counts are
# ordered by string so the report is reproducible from run to run.
my $by_count = sub {
    my ($counts, @grams) = @_;
    return sort { $counts->{$b} <=> $counts->{$a} or $a cmp $b } @grams;
};

# The index range is fixed before the loop and missing slots are read via
# "|| {}".  Dereferencing $clist[$i + 1] directly inside keys() would create
# the slot, lengthen @clist while it is being walked, and append a spurious
# empty section after the longest n-grams.  The empty sections for slots 0
# and 1 are still emitted so the leading layout of the report is unchanged.
for my $i (0 .. $#clist) {
    my $counts = $clist[$i]     || {};
    my $longer = $clist[$i + 1] || {};

    foreach my $gram ($by_count->($counts, keys %$counts)) {
        $emit->('', $gram, $counts->{$gram});

        my $key = quotemeta($gram);

        # (n+1)-grams that extend this n-gram to the right ...
        foreach my $gram2 ($by_count->($longer, grep { m/\A$key/ } keys %$longer)) {
            $emit->(' ', $gram2, $longer->{$gram2});
        }

        # ... and those that extend it to the left.
        foreach my $gram2 ($by_count->($longer, grep { m/$key\z/ } keys %$longer)) {
            $emit->(' ', $gram2, $longer->{$gram2});
        }
    }
    print "\n\n====\n\n";
}