#!/usr/local/ymir/perl/bin/perl

use Unicode::Japanese;


# Settings.
#   $maxgram  - longest n-gram, in characters, that is counted.
#   $printper - minimum share (percent of all counted characters) an n-gram
#               needs in order to be kept and printed.
my $maxgram = 5;
my $printper = 0.01;

# Pattern tested against the 4th tab-separated column of every input line;
# lines that match (symbols, numeric nouns) are treated as breaks rather
# than text. The literal is handed to Unicode::Japanese as 'euc' input and
# read back with ->get.
# NOTE(review): presumably ->get returns UTF-8 so the pattern matches the
# raw input bytes - confirm against the encoding of this file and the input.
my $re = Unicode::Japanese->new('^(記号|名詞-数)', 'euc')->get;

# Converter object reused for every output line. Created with method-call
# syntax; the indirect-object form (`new Class`) is ambiguous to the parser.
my $uj = Unicode::Japanese->new;


# Build the text to analyse from the input given via <> (files on the
# command line, or STDIN). Each line is split on tabs: normally the second
# column is appended to $data, but a line with at most one column, or one
# whose fourth column matches $re, contributes a single space instead.
my $data = '';

while (my $line = <>) {
  my @field = split /\t/, $line;
  my $is_break = @field <= 1 || $field[3] =~ m/$re/;
  $data .= $is_break ? ' ' : $field[1];
}

# Every run of spaces becomes exactly one newline.
$data =~ s/ +/\n/g;

# One UTF-8 encoded character, matched on raw bytes: a single byte below
# 0x80, or a lead byte followed by the matching number of continuation
# bytes (sequences of up to six bytes are accepted).
my $re_char = qr/[\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3}|[\xf8-\xfb][\x80-\xbf]{4}|[\xfc-\xfd][\x80-\xbf]{5}/;

# Splitting on a capturing pattern returns the characters themselves,
# interleaved with whatever lay between them; in-between pieces are empty
# unless the data holds bytes the pattern cannot match. Drop the empties.
my @ch = grep { $_ ne '' } split /($re_char)/, $data;

# Count character n-grams.
#   @clist     - $clist[$len]{$gram} is how often $gram, an n-gram of $len
#                characters, was seen. Only lengths of 2 and above are
#                recorded here.
#   @oldch     - sliding window holding the most recent characters, at most
#                $maxgram of them.
#   $charcount - number of characters seen, breaks excluded.
my @clist;
my @oldch;
my $charcount = 0;
CHAR: for my $char (@ch) {
  next CHAR if $char eq '';

  # A newline or space is a break: no n-gram may span it.
  if ($char eq "\n" || $char eq ' ') {
    @oldch = ();
    next CHAR;
  }

  $charcount++;
  push @oldch, $char;
  shift @oldch if @oldch > $maxgram;

  # Record every n-gram that ends at the current character, shortest
  # (two characters) first.
  my $last = $#oldch;
  for my $start (reverse 0 .. $last - 1) {
    my $len = $last - $start + 1;
    $clist[$len]{ join('', @oldch[$start .. $last]) }++;
  }
}

# Discard every n-gram whose share of all counted characters, expressed as
# a percentage, is below $printper.
for my $len (0 .. $#clist) {
  # Lengths that were never counted get an empty table.
  my $count_of = $clist[$len] ||= {};
  my @rare = grep { $count_of->{$_} / $charcount * 100 < $printper } keys %$count_of;
  delete @{$count_of}{@rare};
}
# Print the report: one section per n-gram length, each closed by a "===="
# separator. Within a section the n-grams are listed by descending count.
# Under each n-gram come, indented by two spaces, the n-grams one character
# longer that begin with it, then those that end with it, each group again
# by descending count (a longer n-gram that both begins and ends with it is
# therefore listed twice). Every entry shows the count and its percentage
# of $charcount, and is passed through $uj->set(...)->euc before printing.
#
# The number of sections is fixed before the loop, and the table for the
# next length is read through `|| {}`. Dereferencing a missing
# $clist[$len + 1] inside keys() would autovivify it; with a loop bound that
# re-reads scalar(@clist) that grows the array mid-loop and yields a
# spurious extra "====" section whenever the longest length has entries.

# Format and print one entry; $indent is '' for an n-gram and '  ' for the
# longer n-grams listed beneath it.
my $emit = sub {
  my ($indent, $gram, $count) = @_;
  my $line = sprintf("%s%s: %s (%.2f)\n", $indent, $gram, $count, $count / $charcount * 100);
  print $uj->set($line)->euc;
};

my $sections = scalar(@clist);
for my $len (0 .. $sections - 1) {
  my $count_of = $clist[$len]     || {};
  my $longer   = $clist[$len + 1] || {};

  for my $gram (sort { $count_of->{$b} <=> $count_of->{$a} } keys %$count_of) {
    $emit->('', $gram, $count_of->{$gram});

    # The n-gram is data, not a pattern: quote it before matching.
    my $quoted = quotemeta($gram);
    for my $match_re (qr/^$quoted/, qr/$quoted$/) {
      my @extended = grep { $_ =~ $match_re } keys %$longer;
      for my $ext (sort { $longer->{$b} <=> $longer->{$a} } @extended) {
        $emit->('  ', $ext, $longer->{$ext});
      }
    }
  }
  print "\n\n====\n\n";
}