File: i2myspell

package info (click to toggle)
magyarispell 1.2%2Brepack-2
links: PTS, VCS
area: main
in suites: wheezy
size: 4,952 kB
sloc: sh: 1,032; awk: 404; makefile: 265; sed: 153; lisp: 102
file content (224 lines) | stat: -rwxr-xr-x 5,002 bytes
#!/bin/sh
# i2myspell
# Ispell->MySpell ragozási táblázat és szótár konvertálása
# Ispell->MySpell affix table and dictionary converter
#
#	Version 2.0.0 (2003.08.21.)
#	Version 1.7.0 (2003.02.20.)
#	Version 1.6.2 (2003.02.04.)
#	Version 1.6.1 (2002.12.12.)
#	Version 1.6.0 (2002.9.22.)
#	Version 1.5.1 (2002.4.17.)
#
#	(c) Copyright 2002 László Németh, All Rights Reserved
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the BSD License (see http://www.opensource.org)

# configuration

# AWK and ISPELL paths

# Configure GNU AWK: run the Magyar Ispell distribution's config file when it
# is present in the current directory, otherwise set AWK here.
if test -f ./config; then
	. ./config
	# Keep whatever the config file chose and only fill in AWK when it was
	# left empty. (The former "test && . ./config || AWK=..." one-liner
	# overwrote the configured AWK whenever the last command of ./config
	# returned a non-zero status, and left AWK empty if ./config did not
	# set it.)
	: "${AWK:=/usr/bin/awk}"
else
	AWK="/usr/bin/awk"
fi

# ispell's name is ispell.orig after mispell installation
if test -f /usr/bin/ispell.orig; then
	ISPELL=ispell.orig
else
	ISPELL=ispell
fi

# END of configuration

# Both modes need at least two arguments; with fewer, show the usage text
# on stdout and stop.
if [ "$#" -lt 2 ]; then
	cat <<'EOF'
i2myspell ispell->myspell affix table and 
dictionary converter

to generate Myspell aff file:
 1. i2myspell ispell_.aff_file affix_file_heading [upper lower]

to generate Myspell dic file:
 2. i2myspell -d ispell_dict_file

Example: 
1. i2myspell magyar.aff hu_HU_heading.txt A-Z a-z > hu_HU.aff
2. i2myspell -d magyar.dict > hu_HU.dic


EOF
	exit
fi

# Sorting under the Hungarian locale gives the wrong order, so run every tool
# in the "C" locale. The caller's LC_ALL is remembered in LC_SAVE so that it
# can be put back once the conversion is done.
LC_SAVE="${LC_ALL}"
export LC_ALL=C

# i2my_convert_dict DICT_FILE
#
# Convert an Ispell dictionary (entries of the form "word/FLAGS") into MySpell
# .dic format on stdout: first the number of resulting lines, then the lines
# themselves, then one empty line.
#
#   - entries are sorted in reverse order; consecutive entries of the same
#     word are merged and their flags concatenated;
#   - duplicated flags are printed once, and a "-" marks the flag that
#     follows it as removed;
#   - lines that consist only of a [...] tag (optionally followed by "/...")
#     are dropped and up to two leading [...] tags are stripped;
#   - lines containing a TAB are regrouped behind the entry of their word.
#     NOTE(review): these look like morphological descriptions; confirm
#     against the Magyar Ispell dictionary format.
#
# Reads the global AWK and relies on LC_ALL=C having been exported so that
# sort(1) orders bytewise.
# Returns non-zero when DICT_FILE is unreadable or no temporary file could be
# created.
# NOTE: several sed/awk/grep patterns below contain literal TAB characters.
i2my_convert_dict() {
	if [ ! -r "$1" ]; then
		echo "i2myspell: cannot read dictionary file: $1" >&2
		return 1
	fi

	# mktemp(1) gives an unpredictable name (the former /tmp/i2my$$.1 could be
	# guessed and pre-created by another user); the traps remove the file even
	# when the script is interrupted.
	i2my_dict_tmp=$(mktemp "${TMPDIR:-/tmp}/i2my.XXXXXX") || return 1
	trap 'rm -f "$i2my_dict_tmp"' EXIT
	trap 'exit 1' HUP INT TERM

	# dict file uniq: separate word and flags, sort, drop duplicates, blank
	# lines and pure [...] tag lines
	sed 's#/# #' <"$1" | sort -r | uniq | grep -v '^[ 	]*$' |
	grep -v '^\[[^]]*\]$' | grep -v '^\[[^]]*\]/.*$' |
	# merge consecutive entries of one word into "word FLAGS/FLAGS/..."
	$AWK '/\t/{print "\n" $0; next}
    { 
    if (p!=$1) { 
	printf "\n%s", $0; 
	p=$1 } 
    else { 
	if ($2!="") printf "/%s", $2; 
    }
}' | grep -v '^$' | sed 's#/##g
s# #/#' |
	# uniq duplicated flags, and parse spec. syntax (-FLAG)
	$AWK '
BEGIN { FS = "/" }
/^__/ || /\t/ { 
    # words with roots, for example __vizek/vz, handled by Magyar MySpell
    print $0;
    next
}
{
	printf "%s", $1;
	op = NR;
	if (NF > 1) {
		printf "/";
		ln = length($2);
		# first pass: remember for every flag whether it is kept (NR)
		# or was cancelled by a preceding "-" (0)
		for (i = 1; i <= ln; i++) {
			c = substr($2, i, 1);
			if (c == "-") {
				op = 0;
			} else {
				b[c] = op;
				op = NR;
			}
		}
		# second pass: print every kept flag once, in original order
		for (i = 1; i <= ln; i++) {
			c = substr($2, i, 1);
			if (b[c] == NR) {
				printf "%s", c;
				b[c] = 0;
			}
		}
	}
	print "";
}' |
	# move the lines containing a TAB behind the plain lines of the same word
	awk '
function key(s) {
	match(s, "^[^	/]*")
	return substr(s, 1, RLENGTH)
}
/^\t/{ next }
/\t/{
	keyloc = key($0)
	if (keyloc != keyprev) {
		print tablines
		tablines = $0
		keyprev = keyloc
	} else {
		if ((tablines == "") || ($2~"^=")) tablines = $0;
		else if (tablines !~ "^=") tablines = tablines "\n" $0
	}
	next
}
{ 
	keyloc = key($0)
	if (keyloc != keyprev) {
		print tablines
		tablines = ""
		keyprev = keyloc
	}
	print $0
}
END{print tablines}' |
	# merge the morphological tags: the first TAB line is appended to the
	# preceding entry, later ones are printed as "entry<TAB>tag"
	awk '
/\t/{
	if (tab==0) {
    	print "\t" $2
    	tab=1
	} else {
    	print stem "\t" $2
	} next
}
NR>1 {print ""}
{printf "%s",$0; stem=$1; tab=0}
' | grep -v '^$' |
	# filter out [vrb] and other part-of-speech tags, and
	# delete the equals sign found in front of the morphological description
	sed 's/^\[[^]]*\]//
s/^\[[^]]*\]//
s/	=/	/' >"$i2my_dict_tmp"

	# MySpell expects the line count first; tr strips the padding some wc(1)
	# implementations print around the number
	wc -l <"$i2my_dict_tmp" | tr -cd '0-9\n'
	cat "$i2my_dict_tmp"
	echo

	rm -f "$i2my_dict_tmp"
	trap - EXIT HUP INT TERM
}

case "$1" in
-d)
	# dictionary mode: i2myspell -d ispell_dict_file
	i2my_convert_dict "$2"
	i2my_status=$?

	LC_ALL=$LC_SAVE
	export LC_ALL
	exit "$i2my_status"
	;;
esac

# convert affixum file

# i2my_convert_aff AFF_FILE HEADING_FILE
#
# Convert the affix rules of an Ispell .aff file into a MySpell affix table
# on stdout. HEADING_FILE is copied to the output first; then, for every flag,
# a header line "PFX|SFX flag Y|N count" followed by its rule lines
# "PFX|SFX flag strip append condition [extra fields]". Y is emitted for
# flags declared as "flag *X:", N for "flag X:".
#
# Only the part of AFF_FILE from the first line containing "prefixes" onwards
# is processed; "#" comments and blank lines are ignored.
#
# Reads the global AWK and relies on LC_ALL=C having been exported so that
# sort(1) orders bytewise.
# Returns non-zero when an input file is unreadable or no temporary directory
# could be created.
# NOTE: several sed patterns below contain literal TAB characters.
i2my_convert_aff() {
	for i2my_file in "$1" "$2"; do
		if [ ! -r "$i2my_file" ]; then
			echo "i2myspell: cannot read file: $i2my_file" >&2
			return 1
		fi
	done

	# A private mktemp(1) directory replaces the guessable /tmp/i2my$$.N
	# files; the traps remove it even when the script is interrupted.
	i2my_tmpdir=$(mktemp -d "${TMPDIR:-/tmp}/i2my.XXXXXX") || return 1
	trap 'rm -rf "$i2my_tmpdir"' EXIT
	trap 'exit 1' HUP INT TERM

	# strip comments and blank lines, keep everything from "prefixes" on
	sed 's/[ 	]*#.*$//
s/`,'"'"'/,/g' <"$1" |
	grep -v '^ *$' |
	sed -n '/prefixes/,$p' |
	tr '\t' ' ' |
	# prefix every rule line with the flag declaration it belongs to
	awk '
  /flag *\*/ { flag=$2; next }
  /flag/ { flag=" " $2; next }
  />/ {print "  flag " flag " " $0; next}
  {print $0}' |
	# normalise the rule to TAB separated "condition > replacement fields";
	# an empty condition becomes "."
	sed 's/: */: /
s/: *> */: .	>	/
s/[ 	]*>[ 	]*/	>	/
s/\(>[^ ]*\) */\1	/
s/\(>[^ ]*\) */\1	/' |
	# turn the flag declaration into PFX/SFX + flag + Y/N and reorder the rule
	# into "strip append condition [extra fields]" ("0" when nothing is
	# stripped)
	sed 's/ //g
/prefixes/,/suffixes/s/flag[*]\(.\):/PFX \1 Y /
/prefixes/,/suffixes/s/flag\(.\):/PFX \1 N /
/suffixes/,//s/flag[*]\(.\):/SFX \1 Y /
/suffixes/,//s/flag\(.\):/SFX \1 N /
s/\([^ ]*\).>.\([^,	]*\)$/0 \2 \1/
s/\([^ ]*\).>.-\([^,	]*\),\([^	]*\)$/\2 \3 \1/
s/\([^ ]*\).>.\([^,	]*\).\([^	]*\)$/0 \2 \1 \3/
s/\([^ ]*\).>.-\([^,	]*\),\([^	]*\).\([^	]*\)$/\2 \3 \1 \4/
s/\([^ ]*\).>.\([^,	]*\).\([^	]*\).\([^	]*\)$/0 \2 \1 \3 \4/
s/\([^ ]*\).>.-\([^,	]*\),\([^	]*\).\([^	]*\).\([^	]*\)$/\2 \3 \1 \4 \5/' |
	# split every line into three column groups: 1 = whole line,
	# 2 = first 7 characters ("PFX f Y"), 3 = fields 7 and up
	tee "$i2my_tmpdir/1" | cut -c -7 >"$i2my_tmpdir/2"
	cut -d " " -f 7- "$i2my_tmpdir/1" >"$i2my_tmpdir/3"

	# myspell affix table header
	cat "$2"

	# Glue the column groups back together (a case-folding tr step between cut
	# and paste is disabled), drop the section marker lines, then emit the
	# aligned rule lines plus one header line per flag.
	cut -d " " -f 4-6 "$i2my_tmpdir/1" |
	paste -d " " "$i2my_tmpdir/2" - |
	paste -d " " - "$i2my_tmpdir/3" |
	grep -E -v '(suffixes|prefixes|flagmarker)' |
	sort -k 2 | $AWK '
    NR==1 { o1=$1; o2=$2; o3=$3; n[o2]=-1; }
    {
    printf "%s %s   %-4s %-12s %-25s %s %s\n", $1, $2, $4, $5, $6, $7, $8;
    if ($2==o2) {
	n[$2]++;
    } else {
	# A new flag starts: close the previous group with its header line.
	# The test used to be NR>2, which lost the header of a first group
	# holding exactly one rule; a previous line without a flag field
	# still gets no header.
	if (o2!="") printf "%s %s %s %s\n", o1, o2, o3, n[o2]+1;
	o1=$1; o2=$2; o3=$3;
    }
} END { printf "%s %s %s %s\n", o1, o2, o3, n[o2]+1; } ' |
	# The reverse sort puts each header in front of its rules; the last sed
	# inserts an empty line before lines ending in " <digit>".
	# NOTE(review): that pattern only matches single-digit counts, so larger
	# groups get no separating empty line.
	sed 's/  *$//' | sort -r | sed 's/\(^.* [0-9]\)$/\
\1/'

	rm -rf "$i2my_tmpdir"
	trap - EXIT HUP INT TERM
}

# Optional 3rd/4th arguments: upper and lower case character ranges. They are
# still accepted, but nothing reads them while the case-folding step is
# disabled.
UPPER=${3:-A-Z}
LOWER=${4:-a-z}

i2my_convert_aff "$1" "$2"
i2my_status=$?

LC_ALL=$LC_SAVE
export LC_ALL

# Last command of the script: its result becomes the exit status, so a failed
# conversion is reported to the caller.
test "$i2my_status" -eq 0