File: split-ngt.sh

package info (click to toggle)
irstlm 6.00.05-7
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 1,600 kB
  • sloc: cpp: 15,933; ansic: 1,590; sh: 1,293; perl: 831; makefile: 118
file content (89 lines) | stat: -rwxr-xr-x 1,956 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#! /bin/bash

# Print the command-line help text on stdout.
# The program name shown in the text is the last path component of $0.
# Callers decide where the text goes (e.g. "usage >&2").
function usage()
{
    # local keeps the helper variable out of the global namespace;
    # ${0##*/} strips the directory part without word-splitting $0,
    # so a script path containing spaces is handled correctly.
    local cmnd=${0##*/}
    cat<<EOF

$cmnd - creates partition files with ngram statistics in Google format

USAGE:
       $cmnd [options] <input> <output> <order> <parts>

DESCRIPTION:
       <input>   Input file name
       <output>  Partition files name prefix
       <order>   Order of the ngrams
       <parts>   Number of partitions

OPTIONS:
       -h        Show this message

EOF
}

# Parse options
# Only -h is recognised. Help that was explicitly requested is printed on
# stdout with a success status; any other option prints the help on stderr
# and exits with status 1.
while getopts h OPT; do
    case "$OPT" in
        h)
            usage
            exit 0
            ;;
        *)
            usage >&2
            exit 1
            ;;
    esac
done
# Drop whatever getopts consumed (for instance a leading "--") so that only
# the positional arguments remain in "$@".
shift $((OPTIND - 1))

#usage:
#split-ngt.sh [options] <input> <output> <order> <parts>
#It creates <parts> files (named <output>.000, ..., <output>.999)
#containing n-gram statistics (of length <order>) in Google format.
#Together, these files form a partition of the whole set of n-grams.

# The tools are looked up under $IRSTLM. Refuse to continue when it is empty:
# otherwise the paths below would silently resolve to /bin and /scripts.
if [ -z "$IRSTLM" ]; then
    echo "Set IRSTLM environment variable with path to irstlm" >&2
    exit 2
fi
basedir=$IRSTLM
bindir=$basedir/bin
scriptdir=$basedir/scripts

# Echo every command-line argument, then capture them all in an array.
for arg in "$@"; do
    echo "$0: arg $arg"
done
par=("$@")

# All four positional arguments are required; show the help text otherwise.
if [ "${#par[@]}" -lt 4 ]; then
    usage >&2
    exit 1
fi

inputfile=${par[0]}
outputfile=${par[1]}
order=${par[2]}
parts=${par[3]}

# Name of the intermediate dictionary file; the shell's PID makes it unique
# per run. It is a relative name, so it lives in the current directory.
dictfile=dict$$


# Run the dict tool from $bindir on the input file, writing to $dictfile.
# Every expansion is quoted so paths containing whitespace stay one word.
echo "Extracting dictionary from training corpus"
"$bindir/dict" -i="$inputfile" -o="$dictfile" -f=y -sort=n

# Without a dictionary there is nothing to split; stop here instead of
# letting the following steps fail one after another.
if [ ! -f "$dictfile" ]; then
    echo "$0: dictionary file $dictfile was not created" >&2
    exit 1
fi

# Split the dictionary; the output prefix passed is "$dictfile." so the
# resulting files share the dictionary's name followed by a dot.
echo "Splitting dictionary into $parts lists"
"$scriptdir/split-dict.pl" --input "$dictfile" --output "${dictfile}." --parts "$parts"

# The full dictionary is not used again after the split.
rm -- "$dictfile"


echo "Extracting n-gram statistics for each word list"
echo "Important: dictionary must be ordered according to order of appearance of words in data"
echo "used to generate n-gram blocks,  so that sub language model blocks results ordered too"

# Matches the trailing ".<digits>" suffix of a partition file name.
suffix_re='(\.[0-9]+)$'
found=0

# Iterate with a glob instead of parsing ls output. ${dictfile:?} aborts the
# script if the prefix is empty, so the pattern can never degrade to ".*"
# and hand unrelated dotfiles to the rm below.
for d in "${dictfile:?dictfile is not set}".*; do
    # An unmatched glob is left as the literal pattern; skip it.
    [ -e "$d" ] || continue
    found=1

    # Output name = <output prefix> + numeric suffix of the partition file.
    if [[ $d =~ $suffix_re ]]; then
        w="$outputfile${BASH_REMATCH[1]}"
    else
        # No numeric suffix: append the whole partition file name.
        w="$outputfile$d"
    fi

    # Name of the partition being processed, without any directory part.
    sdict=${d##*/}
    echo "Extracting n-gram statistics for $sdict"

    # Log the command, then run it; ngt's stdout is discarded.
    echo "$bindir/ngt -i=$inputfile  -n=$order -gooout=y -o=$w -fd=$d  > /dev/null"
    "$bindir/ngt" -n="$order" -gooout=y -o="$w" -fd="$d" -i="$inputfile" > /dev/null

    # The partition's word list is not used again once its n-grams are out.
    rm -- "$d"
done

# Report a run that found no partition files instead of exiting successfully.
if [ "$found" -eq 0 ]; then
    echo "$0: no dictionary partitions matching ${dictfile}.* were found" >&2
    exit 1
fi

exit 0