File: PPMLanguageModel.h

package info (click to toggle)
dasher 4.11%2Bgit20130508.adc653-2
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 40,248 kB
  • ctags: 5,158
  • sloc: xml: 185,479; cpp: 32,301; sh: 11,207; makefile: 828; ansic: 483
file content (221 lines) | stat: -rw-r--r-- 7,888 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
// PPMLanguageModel.h
//
/////////////////////////////////////////////////////////////////////////////
//
// Copyright (c) 1999-2005 David Ward
//
/////////////////////////////////////////////////////////////////////////////

#ifndef __PPMLanguageModel_h__
#define __PPMLanguageModel_h__

#include "../../Common/NoClones.h"
#include "../../Common/Allocators/PooledAlloc.h"

#include "LanguageModel.h"
#include "../SettingsStore.h"
#include "stdlib.h"
#include <vector>
#include <fstream>
#include <set>
#include <map>

namespace Dasher {

  ///
  /// \ingroup LM
  /// @{
  ///
  /// Common superclass for both PPM and PPMY language models. Implements the PPM tree,
  /// inc. fast hashing of child nodes by symbol number; and entering and learning symbols
  /// in a context, i.e. navigating and updating the tree, with update exclusion according
  /// to LP_LM_UPDATE_EXCLUSION
  ///
  /// Subclasses must implement CLanguageModel::GetProbs and a makeNode() method (perhaps
  /// using a pooled allocator).
  ///
  class CAbstractPPM :public CLanguageModel, protected CSettingsUser, private NoClones {
  protected:
    class ChildIterator;
    class CPPMnode {
    private:
      union {
        CPPMnode **m_ppChildren;
        CPPMnode *m_pChild;
      };
      ///Elements in above array, including nulls, as follows:
      /// (a) negative -> absolute value is number of elems in m_ppChildren, but use direct indexing
      /// (b) 1 -> use m_pChild as direct pointer to CPPMnode (no array)
      /// (c) 2-MAX_RUN -> m_ppChildren is unordered array of that many elems
      /// (d) >MAX_RUN ->  m_ppChildren is an inline hash (overflow to next elem) with that many slots
      int m_iNumChildSlots;
      friend class CPPMLanguageModel;
	  public:
      ChildIterator children() const;
      const ChildIterator end() const;
      void AddChild(CPPMnode *pNewChild, int numSymbols);
      CPPMnode * find_symbol(symbol sym)const;
      CPPMnode *vine;
      unsigned short int count;
      symbol sym;
      CPPMnode(symbol sym);
      CPPMnode();
      virtual ~CPPMnode();
      virtual bool eq(CPPMnode *other, std::map<CPPMnode *,CPPMnode *> &equivs);
	  };
    class ChildIterator {
    private:
      void nxt() {
        if (m_ppChild == m_ppStop) return;
        while ((--m_ppChild) != m_ppStop)
          if (*m_ppChild) break;
      }
    public:
      bool operator==(const ChildIterator &other) const {return m_ppChild==other.m_ppChild && m_ppStop == other.m_ppStop;}
      bool operator!=(const ChildIterator &other) const {return m_ppChild!=other.m_ppChild || m_ppStop!=other.m_ppStop;}
      CPPMnode *operator*() const {return (m_ppChild == m_ppStop) ? NULL : *m_ppChild;}
      ChildIterator &operator++() {nxt(); return *this;} //prefix
      ChildIterator operator++(int) {ChildIterator temp(*this); nxt(); return temp;}
      //operator CPPMnode *() {return node;} //implicit conversion
      //operator bool();                     //implicit conversion 2
      ChildIterator(CPPMnode *const *ppChild, CPPMnode *const *ppStop) : m_ppChild(ppChild), m_ppStop(ppStop) {nxt();}
    private:
      CPPMnode *const *m_ppChild, *const *m_ppStop;
    };

    class CPPMContext {
    public:
      CPPMContext(CPPMContext const &input) {
        head = input.head;
        order = input.order;
      } CPPMContext(CPPMnode * _head = 0, int _order = 0):head(_head), order(_order) {
      };
      ~CPPMContext() {
      };
      void dump();
      CPPMnode *head;
      int order;
    };
    
    ///Makes a new node, of whatever kind (subclass of CPPMnode, perhaps with extra info)
    /// is required by the subclass, for the specified symbol. (Initial count will be 1.)
    virtual CPPMnode *makeNode(int sym)=0;
    /// \param iMaxOrder max order of model; anything <0 means to use LP_LM_MAX_ORDER.
    CAbstractPPM(CSettingsUser *pCreator, int iNumSyms, CPPMnode *pRoot, int iMaxOrder=-1);
    
    void dumpSymbol(symbol sym);
    void dumpString(char *str, int pos, int len);
    void dumpTrie(CPPMnode * t, int d);
    
    CPPMContext *m_pRootContext;
    CPPMnode *m_pRoot;
    
    /// Cache parameters that don't make sense to adjust during the life of a language model...
    const int m_iMaxOrder; 
    const bool bUpdateExclusion;
    
  public:
    virtual bool eq(CAbstractPPM *other);
    virtual ~CAbstractPPM() {};

    Context CreateEmptyContext();
    void ReleaseContext(Context context);
    Context CloneContext(Context context);

    virtual void EnterSymbol(Context context, int Symbol);
    virtual void LearnSymbol(Context context, int Symbol);

    void dump();
    bool isValidContext(const Context c) const ;
  private:
    CPPMnode *AddSymbolToNode(CPPMnode * pNode, symbol sym);

    CPooledAlloc < CPPMContext > m_ContextAlloc;
    
    std::set<const CPPMContext *> m_setContexts;
  };

  ///"Standard" PPM language model: GetProbs uses counts in PPM child nodes,
  /// universal alpha+beta values read from LP_LM_ALPHA and LP_LM_BETA,
  /// max order from LP_LM_MAX_ORDER.
  class CPPMLanguageModel : public CAbstractPPM {
  public:
    CPPMLanguageModel(CSettingsUser *pCreator, int iNumSyms);
    virtual void GetProbs(Context context, std::vector < unsigned int >&Probs, int norm, int iUniform) const;
  protected:
    /// Makes a standard CPPMnode, but using a pooled allocator (m_NodeAlloc) - faster!
    virtual CPPMnode *makeNode(int sym);
    
    virtual bool WriteToFile(std::string strFilename);
    virtual bool ReadFromFile(std::string strFilename);
  private:
    int NodesAllocated;

    bool RecursiveWrite(CPPMnode *pNode, CPPMnode *pNextSibling, std::map<CPPMnode *, int> *pmapIdx, int *pNextIdx, std::ofstream *pOutputFile);
    int GetIndex(CPPMnode *pAddr, std::map<CPPMnode *, int> *pmapIdx, int *pNextIdx);
    CPPMnode *GetAddress(int iIndex, std::map<int, CPPMnode*> *pMap);

    mutable CSimplePooledAlloc < CPPMnode > m_NodeAlloc;
  };

  /// @}
  inline CAbstractPPM::ChildIterator CPPMLanguageModel::CPPMnode::children() const {
    //if m_iNumChildSlots = 0 / 1, m_ppChildren is direct pointer, else ptr to array (of pointers)
    CPPMnode *const *ppChild = (m_iNumChildSlots == 0 || m_iNumChildSlots == 1) ? &m_pChild : m_ppChildren;
    return ChildIterator(ppChild + abs(m_iNumChildSlots), ppChild - 1);
  }
  
  inline const CAbstractPPM::ChildIterator CPPMLanguageModel::CPPMnode::end() const {
    //if m_iNumChildSlots = 0 / 1, m_ppChildren is direct pointer, else ptr to array (of pointers)
    CPPMnode *const *ppChild = (m_iNumChildSlots == 0 || m_iNumChildSlots == 1) ? &m_pChild : m_ppChildren;
    return ChildIterator(ppChild, ppChild - 1);
  }

  inline Dasher::CAbstractPPM::CPPMnode::CPPMnode(symbol _sym): sym(_sym) {
    vine = 0;
    m_iNumChildSlots = 0;
    m_ppChildren = NULL;
    count = 1;
  }

  inline CAbstractPPM::CPPMnode::CPPMnode() {
    vine = 0;
    m_iNumChildSlots = 0;
    m_ppChildren = NULL;
    count = 1;
  }
  
  inline CAbstractPPM::CPPMnode::~CPPMnode() {
    //single child = is direct pointer to node, not array...
    if (m_iNumChildSlots != 1)
      delete[] m_ppChildren;
  }

  inline CLanguageModel::Context CAbstractPPM::CreateEmptyContext() {
    CPPMContext *pCont = m_ContextAlloc.Alloc();
    *pCont = *m_pRootContext;

    m_setContexts.insert(pCont);

    return (Context) pCont;
  }

  inline CLanguageModel::Context CAbstractPPM::CloneContext(Context Copy) {
    CPPMContext *pCont = m_ContextAlloc.Alloc();
    CPPMContext *pCopy = (CPPMContext *) Copy;
    *pCont = *pCopy;

    m_setContexts.insert(pCont);

    return (Context) pCont;
  }

  inline void CAbstractPPM::ReleaseContext(Context release) {

    m_setContexts.erase(m_setContexts.find((CPPMContext *) release));

    m_ContextAlloc.Free((CPPMContext *) release);
  }
}                               // end namespace Dasher

#endif // __LanguageModelling__PPMLanguageModel_h__