File: abbrev-convert.rb

package info (click to toggle)
skktools 1.3.3-2
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 1,380 kB
  • ctags: 349
  • sloc: sh: 11,551; ansic: 1,300; ruby: 1,049; perl: 798; lisp: 431; python: 241; makefile: 126; awk: 94; cpp: 73; sed: 1
file content (142 lines) | stat: -rwxr-xr-x 4,868 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#!/usr/local/bin/ruby -Ke
# -*- coding: euc-jp -*-
## Copyright (C) 2005 MITA Yuusuke <clefs@mail.goo.ne.jp>
##
## Author: MITA Yuusuke <clefs@mail.goo.ne.jp>
## Maintainer: SKK Development Team <skk@ring.gr.jp>
## Version: $Id: abbrev-convert.rb,v 1.6 2013/05/26 09:47:48 skk-cvs Exp $
## Keywords: japanese, dictionary
## Last Modified: $Date: 2013/05/26 09:47:48 $
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2, or (at your option)
## any later version.

## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
## General Public License for more details.

## You should have received a copy of the GNU General Public License
## along with this program, see the file COPYING.  If not, write to the
## Free Software Foundation Inc., 51 Franklin St, Fifth Floor, Boston,
## MA 02110-1301, USA.
##
### Instruction:
## This script reads an SKK-formatted dictionary from a file or stdin,
## extracts the pairs with an alphabetic key and a 'katakana' candidate
## (e.g. "player /プレイヤー/"), and then converts them into other styles.
##
##    % abbrev-convert.rb SKK-JISYO.L | skkdic-expr2 > SKK-JISYO.waei
##
## The default action is to produce reversed pairs that can be used to
## convert katakana words into their original spellings,
## e.g. "ぷれいやー /player/".
##
##    % abbrev-convert.rb -k SKK-JISYO.L | skkdic-expr2 > SKK-JISYO.hira-kata
##
## If the '-k' or '-K' option is given, the result is hiragana-katakana
## pairs such as "ぷれいやー /プレイヤー/". With '-K', the original
## key is appended as an annotation ("ぷれいやー /プレイヤー;player/").
##
##    % cat .skk-jisyo .skkinput-jisyo | abbrev-convert.rb -e SKK-JISYO.L | skkdic-expr2 > .skk-jisyo-abbrev
##
## If '-e' is given, it merely extracts alphabet-katakana (abbrev) pairs;
## you may wish to send the result to the dev-team to help the
## dictionary grow :-)
##
##
## The '-s <num>' option suppresses words of <num> letters or fewer (in Zenkaku).
## This can reduce the flood of homonyms caused by adding short words.
## 
## '-u' eliminates all the annotations.
##
## '-p' eliminates pairs with "※" or "?" annotations that are suspected as 'wrong' words.
##
require 'jcode' if RUBY_VERSION.to_f < 1.9
#require 'kconv'
require 'optparse'
# Command-line configuration.  The option handlers below overwrite these
# locals, which the rest of the script reads.
opt = OptionParser.new

mode = "waei"       # output style: "extract", "waei", "hira-kata" or "hira-kata-with-spell"
unannotate = false  # -u: drop annotations from the output
stem = 0            # -s: length threshold; candidates at or below it are skipped
purge = false       # -p: drop candidates whose annotation marks them as doubtful

opt.on('-e', 'extract alphabet-katakana pairs') { mode = "extract" }
opt.on('-w', 'output hiragana-alphabet pairs') { mode = "waei" }
opt.on('-k', 'output hiragana-katakana pairs') { mode = "hira-kata" }
opt.on('-K', 'same as -k, with original MIDASI as annotation') { mode = "hira-kata-with-spell" }
# NOTE(review): the empty "" in the help text below looks like a multibyte
# character lost to a charset mis-decoding; kept as found -- restore it from
# a pristine copy of the script.
opt.on('-p', 'purge candidates marked with "" or "?"') { purge = true }
opt.on('-u', 'eliminate annotations') { unannotate = true }
# The value is doubled: the threshold is stored as VAL * 2 (presumably two
# bytes per Zenkaku letter in EUC-JP -- confirm against the length check
# in the main loop).
opt.on('-s VAL', 'stem candidates equal or shorter than VAL letters') { |v| stem = v.to_i * 2 }

begin
  opt.parse!(ARGV)
rescue OptionParser::ParseError => e
  # ParseError is the common parent of InvalidOption, MissingArgument, etc.,
  # so e.g. a bare '-s' is reported cleanly instead of dumping a backtrace.
  # Diagnostics go to stderr because stdout is normally piped into another
  # dictionary tool.
  $stderr.print "#{e.message}\n"
  $stderr.print "'#{$0} -h' for help.\n"
  exit 1
end

# Main conversion loop: reads dictionary lines via Kernel#gets (the files
# named in ARGV, or stdin), keeps the candidates that pass the filters below,
# and prints them in the style selected by `mode`.
#
# NOTE(review): the non-ASCII literals in this block (the character classes
# and the tr/gsub arguments) look like EUC-JP text that was decoded with the
# wrong charset; presumably they spelled katakana/hiragana ranges.  They are
# kept exactly as found here -- restore them from a pristine EUC-JP copy of
# the script before relying on the output.

# Rewrites a candidate into the form used as the output key (the original
# code called the result `word_hira`).  Shared by the three non-extract modes.
to_hira = lambda do |word|
  word.tr('-', '-').gsub(//, '').gsub(/[=᡾]/, '')
end

# `stem` holds the -s value doubled, i.e. it is meant as a byte count for
# two-byte letters.  String#length counts bytes only on Ruby < 1.9, so
# measure bytes explicitly wherever String#bytesize exists.
byte_length = lambda do |str|
  str.respond_to?(:bytesize) ? str.bytesize : str.length
end

while (line = gets)
  # Only entries whose key starts with an ASCII letter or digit are of
  # interest; this also skips blank lines and ';' comment lines.
  next if line =~ /^[^a-zA-Z0-9]/

  # chomp (not chop) so that a final line lacking a newline keeps its last
  # character.
  midasi, body = line.chomp.split(" /", 2)
  # A line without the " /" separator has no candidates at all.
  next if body.nil?

  # Drop everything from the first "/[" block onwards, then split the
  # remaining candidates.
  tokens = body.sub(/\/\[.*/, "").split("/")

  candidates = []
  tokens.each do |token|
    word, annotation = token.split(";")
    next if word.nil?                         # empty token, e.g. "key //x/"
    next if word =~ /[^-=᡾]/                  # contains a character outside the allowed set
    next if byte_length.call(word) <= stem    # too short (-s)
    next if word !~ /[-]/ # at least 1 valid letter
    next if purge && annotation =~ //
    next if purge && annotation =~ /\?$/
    # With -u the annotation is discarded here, after the purge checks that
    # still need it.
    candidates.push([word, unannotate ? nil : annotation])
  end

  # Array#nitems no longer exists on Ruby >= 1.9; empty? works everywhere.
  next if candidates.empty?

  case mode
  when "extract"
    # One output line per key: "key /cand1;note/cand2/".
    entries = candidates.map do |word, note|
      note ? "#{word};#{note}" : word
    end
    print "#{midasi} /#{entries.join('/')}/\n"
  when "waei"
    # One output line per candidate, with the original key as the candidate.
    candidates.each do |word, note|
      tail = note ? ";#{note}" : ""
      print "#{to_hira.call(word)} /#{midasi}#{tail}/\n"
    end
  when "hira-kata"
    candidates.each do |word, note|
      tail = note ? ";#{note}" : ""
      print "#{to_hira.call(word)} /#{word}#{tail}/\n"
    end
  when "hira-kata-with-spell"
    # The original key becomes the annotation; an existing annotation is
    # appended directly after it (a nil note interpolates as "").
    # NOTE(review): there is no separator between key and annotation --
    # confirm this is intended.
    candidates.each do |word, note|
      print "#{to_hira.call(word)} /#{word};#{midasi}#{note}/\n"
    end
  end
end