File: mark_dubious_translations.pl

package info (click to toggle)
freedict-tools 0.7.0-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,520 kB
  • sloc: python: 2,869; haskell: 1,999; perl: 1,509; yacc: 502; sh: 435; sed: 392; makefile: 141; xml: 10
file content (142 lines) | stat: -rw-r--r-- 2,940 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#!/usr/bin/perl

use strict;
use utf8;
use LWP::Simple;
use Getopt::Std;

# Package variables that Getopt::Std's getopts() fills in from the command line.
our $opt_l;   # -l <language name>   (takes a value)
our $opt_s;   # -s <sleep seconds>   (takes a value)
our $opt_m;   # -m <mark>            (takes a value)
our $opt_h;   # -h                   (flag, no value)

# A letter followed by ':' expects an argument; parsed switches are removed
# from @ARGV, leaving only the positional in_file / out_file arguments.
getopts('l:s:m:h');


# Print the usage text to the currently selected output handle and terminate
# the script. Takes no arguments and never returns: `exit` is called without
# a status, so the process ends with exit code 0.
# NOTE(review): Getopt::Std appears to look for a sub with this name when
# --help is given - confirm against its documentation.
sub HELP_MESSAGE
{
  # Interpolating heredoc: $0 expands to the script name, \@ is a literal '@'.
  my $usage = <<"EOT";

$0 -l <language name> [-s <sleep seconds>] [-m <mark>] [-h] [in_file [out_file]]

This script scans .tei dictionary for translations values marked with <quote>
tags, and tries to query http://en.wiktionary.org online dictionary for articles
titled as value found between <quote></quote>. If article is found and has a
section <language name> inside, then this translation is considered to be ok.
Otherwise translation is considered to be dubious and marked with a <mark> sign
at the beginning to the line.

Then you can use `diff -u10` and deal with dubious translation only.

This script can be useful if you start maintaining buggy dictionary and want to
eliminate typos and misspelling.


Options:

-h	help & exit
-l	language name as used in en.wiktionary.org section names. I.e. "Russian"
-s	sleep time between fetches in seconds, in order not to DDoS wiktionary.
	Default is 3
-m	a symbol for marking dubious translations. Default is '☹'

Usage examples:

  $0 -l Russian eng-rus.tei eng-rus.tei.marked
  
  $0 -l Russian <eng-rus.tei >eng-rus.tei.marked

Author:
  Nikolay Shaplov <dhyan\@nataraj.su>, 2014

License:
  GNU General Public License ver. 2.0 and any later version.

EOT

  print $usage;
  exit;
}

# Show the usage text (and exit) when help was requested or the mandatory
# -l <language name> option is missing.
HELP_MESSAGE() if $opt_h or !$opt_l;

# Section name to look for on the fetched Wiktionary page, e.g. "Russian".
my $target_lang = $opt_l;
# Seconds to wait between fetches (-s); a missing or zero value means 3.
my $sleep = $opt_s || 3;
my $error_mark = $opt_m || '☹'; # do not use symbols that might be inside your dictionary!!!

# Values taken from the command line are normally raw bytes, while all I/O
# below goes through :utf8 layers. Decode them so a non-ASCII mark or language
# name is not double-encoded on output. utf8::decode leaves the string
# untouched when it is not valid UTF-8 (or is already a character string).
utf8::decode($target_lang);
utf8::decode($error_mark) if $opt_m;

# Input: first positional argument, or STDIN when none is given.
my $in;
if ($ARGV[0])
{
  open $in, "<:utf8" , $ARGV[0] or die "cannot open ".$ARGV[0]." $!";
} else
{
  binmode(STDIN, ":utf8");
  $in = *STDIN;
}

# Output: second positional argument, or STDOUT when none is given.
my $out;
if ($ARGV[1])
{
  # Opening for writing truncates the file, so writing over the input file
  # would destroy the dictionary before it has been read.
  die "input and output must be different files\n"
    if defined $ARGV[0] and $ARGV[0] eq $ARGV[1];

  # Must be opened for WRITING ('>'); with '<' every print to $out fails
  # silently and the marked dictionary is never produced.
  open $out, ">:utf8" , $ARGV[1] or die "cannot open ".$ARGV[1]." $!";
} else
{
  binmode(STDOUT, ":utf8");
  $out = *STDOUT;
}

# Progress messages printed to STDERR contain dictionary text, hence UTF-8.
binmode(STDERR, ":utf8");

# Pending (not yet newline-terminated) output; my_print() appends to it and
# flushes it whenever a complete line becomes available.
my $buf = "";

# Read the whole dictionary at once: a <quote> element may span several lines.
my $text = do { local $/; <$in> };
$text = "" unless defined $text;

# Repeatedly cut off the text up to and including the next <quote>...</quote>
# element, check its content against Wiktionary and emit it, marked if dubious.
my $fetched = 0;
while ($text=~s{^(.*?)(<quote.*?>)(.*?)(</quote.*?>)}{}s)
{
  my ($header, $open, $def, $close) = ($1, $2, $3, $4);

  print STDERR $def;

  # Honour -s: pause between consecutive fetches so the server is not flooded.
  sleep $sleep if $fetched++;

  my $content = get("http://en.wiktionary.org/wiki/$def");

  # get() returns undef on failure; that counts as "not found".
  # The language name is matched literally (\Q...\E), not as a regex.
  # NOTE(review): this check depends on Wiktionary's headline markup (id equal
  # to the section title) - verify it still matches the pages being served.
  my $ok = (defined $content
    and $content =~ /<h2><span class="mw-headline" id="\Q$target_lang\E">\Q$target_lang\E/s) ? 1 : 0;

  # Put the mark right before the opening tag; my_print() later moves it to
  # the beginning of the output line.
  $open =~ s/</$error_mark</s unless $ok;

  print STDERR $ok ? " Ok\n" : " Error!\n";
  my_print("$header$open$def$close");
}

# Whatever follows the last <quote> element, then the final unterminated line.
my_print($text);
print $out $buf;

# Buffered write errors only surface when the handle is closed.
close $out or die "cannot close output: $!";


# Line-buffered printer that moves $error_mark to the beginning of the line.
#
# Takes one string, which may contain any number of newlines. Text after the
# last newline is kept in the file-level $buf until a later call completes the
# line. Each completed line is written to $out; if it contains $error_mark
# anywhere, every occurrence is removed and a single mark is put in front.
#
# Returns 0 when the argument contained no newline (everything was buffered);
# otherwise the return value is the new content of $buf. The callers in this
# file ignore it.
sub my_print
{
  my $str = shift;
  my @lines = split /\n/, $str, -1;

  # The last piece is not newline-terminated yet. An empty or undefined
  # argument yields no pieces at all and changes nothing.
  my $tail = pop @lines;
  $tail = "" unless defined $tail;

  if (!@lines)
  {
    $buf .= $tail;
    return 0;
  }

  # The first completed line starts with whatever was buffered earlier.
  $lines[0] = $buf.$lines[0];

  for my $line (@lines)
  {
    # \Q...\E: the mark is plain text, not a pattern - a mark such as '*' or
    # '.' would otherwise break the regex or delete unrelated characters.
    if ($line =~ s/\Q$error_mark\E//g)
    {
      $line = $error_mark.$line;
    }
    print {$out} $line, "\n";
  }
  $buf = $tail;
}