File: frequency-normalizer.pl

package info (click to toggle)
varnam-schemes 1.8.0-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 8,500 kB
  • sloc: ruby: 1,284; python: 194; sh: 72; perl: 28; makefile: 13
file content (53 lines) | stat: -rw-r--r-- 1,336 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# Reweigh the words
# Sample: word 8671269 to word 200
# Source: command line argument
# Output to terminal

# Biggest value = # of lines.
# Divide this by (max - min), round down and add 1 to get the divider.
# Count the lines left in the list, divide that by the divider, round down and add min.
# All values should now be between min and max - 1 (e.g. min 15, max 255 gives 15 to 254).

# Abort unless exactly three arguments (<file> <min> <max>) were supplied.
# The usage text is emitted through die, i.e. on STDERR, so it can never end
# up inside the re-weighted word list when STDOUT is redirected to a file.
if (@ARGV != 3) {
    die "Need 3 arguments: <file> <min> <max>\n";
}

# First pass: count the lines of the source file and derive the divider
# that is later used to scale line positions into weights.
use strict;
use warnings;
use utf8;
use Scalar::Util qw(looks_like_number);

# Lower and upper bound of the weights, taken from the command line.
my $min = $ARGV[1];
my $max = $ARGV[2];

# Both bounds must be numbers, and the range must not be empty or inverted:
# ($max - $min) is used as a divisor below, so an equal pair would abort
# with a division by zero and an inverted pair would yield a zero or
# negative divider.
unless (looks_like_number($min) && looks_like_number($max)) {
    die "<min> and <max> must be numbers\n";
}
unless ($max > $min) {
    die "<max> ($max) must be greater than <min> ($min)\n";
}

# Three-argument open with a lexical handle: the mode is fixed to reading,
# so characters such as '|' or '>' in the file name cannot change what
# open() does.
open my $count_fh, '<', $ARGV[0]
    or die "Cannot open '$ARGV[0]': $!";

# Count the # of lines
my $count = 0;
while (defined(my $skipped = <$count_fh>)) {
    $count++;
}
close $count_fh;

# Calculate the divider to ensure results between min and max.
# The "+ 1" keeps the divider at 1 or more, even for an empty file.
my $divider = int( $count / ($max - $min)) + 1;

# Return true when the argument is made up only of digits with an optional
# leading sign (e.g. "42", "-7", "+13"), and false for anything else.
# Because the pattern ends in "$", one trailing newline is tolerated.
sub is_integer {
    my ($text) = @_;
    return $text =~ /^[+-]?\d+$/;
}
# Second pass: re-open the source file, decoding it as UTF-8, and print
# every word line with its weight replaced.
#
# $count holds the total number of lines from the counting pass.  It is
# decremented once per line read (including lines that end up being
# dropped), so earlier lines receive higher weights and the last line of the
# file is weighted exactly $min.
open my $words_fh, "<:encoding(utf8)", $ARGV[0]
    or die "Cannot open '$ARGV[0]': $!";

# NOTE(review): the original comment lists the characters
# ’, “, ।, —, ‘, ·, −, °, ”, ॥ for removal, but nothing in this loop strips
# them -- confirm whether that step is still wanted.
while (my $line = <$words_fh>) {
    $count--;

    # Lines without any whitespace are not printed at all.
    next unless $line =~ /\s/;

    my $weighed = int( $count / $divider) + $min;

    # NOTE(review): (.*) is greedy and cannot cross a newline, so for a
    # newline-terminated line $name is the whole line content (word AND old
    # weight), not just the word -- confirm this is intended.
    my ($name) = $line =~ m/(.*)\s/;

    # Drop entries of at most one character and entries that are nothing
    # but an integer.
    next unless length($name) > 1 && !is_integer($name);

    # Every number on the line (integer or decimal) becomes the new weight.
    # NOTE(review): this also rewrites digits inside the word itself.
    $line =~ s/(\d*[.])?\d+/$weighed/g;

    # The input layer decoded the line into characters; turn it back into
    # UTF-8 bytes, since this script sets no encoding layer on STDOUT.
    utf8::encode($line);
    print $line;
}

close $words_fh;