File: TagLex.cxx

package info (click to toggle)
mbt 3.4-1
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 2,976 kB
  • sloc: sh: 4,244; cpp: 3,351; makefile: 38; ansic: 15
file content (167 lines) | stat: -rw-r--r-- 4,094 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
/*
  Copyright (c) 1998 - 2018
  CLST  - Radboud University
  ILK   - Tilburg University
  CLiPS - University of Antwerp

  This file is part of mbt

  mbt is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 3 of the License, or
  (at your option) any later version.

  mbt is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program; if not, see <http://www.gnu.org/licenses/>.

  For questions and suggestions, see:
      https://github.com/LanguageMachines/mbt/issues
  or send mail to:
      lamasoftware (at ) science.ru.nl

*/

#include <algorithm>
#include <map>
#include <vector>
#include <iostream>
#include <cstdlib>
#include <cstring>
#include <string>

#include "ticcutils/StringOps.h"
#include "mbt/TagLex.h"

namespace Tagger {
  using namespace std;

  // Build the record for the word `name`, seen for the first time with
  // `tag`. The frequency starts at zero; Update() then counts this first
  // observation.
  TagInfo::TagInfo( const string& name, const string& tag )
    : Word( name ),
      WordFreq( 0 )
  {
    Update( tag );
  }

  // Nothing to release by hand: the destructor body is empty.
  TagInfo::~TagInfo() = default;

  // Register one more occurrence of this word carrying `tag`: bumps the
  // overall word count and the per-tag count (operator[] creates the
  // per-tag entry on first use).
  void TagInfo::Update( const string& tag ){
    WordFreq += 1;
    TagFreqs[tag] += 1;
  }

  // Drop every tag whose share of this word's occurrences, expressed as a
  // percentage of WordFreq, lies strictly below `Threshold`.
  // WordFreq itself is left untouched.
  void TagInfo::Prune( int Threshold ){
    for ( auto pos = TagFreqs.begin(); pos != TagFreqs.end(); ){
      const double share =
        ( static_cast<double>( pos->second ) * 100 ) / static_cast<double>( WordFreq );
      if ( share < Threshold ){
        // erase() hands back the element after the removed one
        pos = TagFreqs.erase( pos );
      }
      else {
        ++pos;
      }
    }
  }

  // Render the tag counts as "tag:count " items (note the trailing space
  // after every item), in the iteration order of TagFreqs.
  string TagInfo::DisplayTagFreqs( ) const {
    string listing;
    for ( const auto& entry : TagFreqs ){
      listing += entry.first;
      listing += ":";
      listing += TiCC::toString( entry.second );
      listing += " ";
    }
    return listing;
  }

  // Small value type pairing a frequency with a string, so that strings
  // can be sorted by their frequency.
  struct FS {
    int freq;    // the count
    string str;  // the text the count belongs to

    FS( int f, const string& s )
      : freq( f ),
        str( s )
    {
    }
  };

  // Comparison for sort(): yields 1 when p1 has a strictly higher
  // frequency than p2 and 0 otherwise, i.e. it orders on descending
  // frequency.
  int cmpFreq( const FS& p1, const FS& p2 ){
    return p1.freq > p2.freq;
  }

  void TagInfo::CreateStringRepr(){
    vector<FS> FreqTags;
    for ( const auto& it : TagFreqs ){
      FreqTags.push_back( FS( it.second, it.first) );
    }
    sort( FreqTags.begin(), FreqTags.end(), cmpFreq );
    string tmpstr;
    for ( auto const& it2 : FreqTags ){
      tmpstr += it2.str;
      if ( &it2 != &FreqTags.back() ){
	tmpstr += ";";
      }
    }
    StringRepr = tmpstr;
  }

  // Print one entry as " word:freq {tag:count ...} stringrepr".
  // A null pointer prints nothing; the stream is returned either way.
  ostream& operator<<( ostream& os, TagInfo *LI ){
    if ( !LI ){
      return os;
    }
    os << " " << LI->Word << ":" << LI->WordFreq;
    os << " {" << LI->DisplayTagFreqs() << "} " << LI->StringRepr;
    return os;
  }

  // Start with an empty lexicon: no entries counted yet and a freshly
  // allocated trie to hold the TagInfo records.
  TagLex::TagLex(){
    NumOfEntries = 0;
    TagTree = new Trie<TagInfo>;
  }

  // Dispose of the trie that was allocated with new in the constructor.
  TagLex::~TagLex()
  {
    delete TagTree;
  }

  // Find the entry stored for the word `name`.
  // Returns whatever the trie's Retrieve() yields for that key; Store()
  // treats a null result as "word not present".
  //
  // No cast is needed: Store() assigns the very same Retrieve() result to
  // a TagInfo* directly. The former reinterpret_cast could only mask a
  // type error should Retrieve() ever change.
  TagInfo *TagLex::Lookup( const string& name ){
    return TagTree->Retrieve( name );
  }

  // Record one occurrence of the word `name` with `tag`.
  // A word that is already present only gets its counts updated; a new
  // word increments NumOfEntries and gets a fresh TagInfo placed in the
  // trie. Returns the existing entry, or for a new word the result of the
  // trie's Store().
  TagInfo *TagLex::Store( const string& name, const string& tag ){
    TagInfo *known = TagTree->Retrieve( name );
    if ( known ){
      known->Update( tag );
      return known;
    }
    ++NumOfEntries;
    TagInfo *fresh = new TagInfo( name, tag );
    return TagTree->Store( name, fresh );
  }

  // Callback in the shape ForEachDo() is called with: `arg` must point to
  // a vector<TagInfo*>, to which the visited entry `TI` is appended.
  void StoreInVector( TagInfo *TI, void *arg ){
    auto *sink = static_cast<vector<TagInfo*> *>( arg );
    sink->push_back( TI );
  }

  // Ordering predicate for sorting TagInfo entries; returns true when t1
  // has to come before t2.
  //
  //  1. sort on descending frequency
  //  2. when same frequency, sort alphabetically on the lowercased word
  //  3. words that differ in case only (e.g. Land/land): the spelling that
  //     compares greater comes first
  //
  // The previous version compared the raw words in step 2. Combined with
  // step 3 that is not a strict weak ordering: with equal frequencies it
  // gave "Land" < "Lb" < "land" < "Land", a cycle, and handing such a
  // predicate to sort() is undefined behaviour. Ordering on the lowercased
  // word first and using the raw spelling purely as a tie-break removes
  // the cycle.
  bool ascendingInfo( const TagInfo* t1, const TagInfo* t2 ){
    if ( t1->Freq() != t2->Freq() ){
      // higher frequency first
      return t1->Freq() > t2->Freq();
    }
    const string low1 = TiCC::lowercase( t1->Word );
    const string low2 = TiCC::lowercase( t2->Word );
    if ( low1 != low2 ){
      return low1 < low2;
    }
    // NOTE(review): the original comment promised "Uppercase before
    // lowercase", but this (unchanged) comparison puts the greater
    // spelling first -- for plain ASCII std::string words that is "land"
    // before "Land". The existing result is kept; confirm which is wanted.
    return t2->Word < t1->Word;
  }

  vector<TagInfo *> TagLex::CreateSortedVector(){
    vector<TagInfo*> TagVec;
    TagTree->ForEachDo( StoreInVector, (void *)&TagVec );
    sort( TagVec.begin(), TagVec.end() , ascendingInfo );
    return TagVec;
  }

  // Print a lexicon by streaming its trie.
  // A null lexicon pointer prints nothing, matching the behaviour of the
  // operator<< for TagInfo* in this file; previously it was dereferenced
  // unchecked.
  ostream& operator<<( ostream& os, TagLex *L ){
    if ( !L ){
      return os;
    }
    return os << L->TagTree;
  }

}