File: gentrigrams.cpp

package info (click to toggle)
sonnet 5.116.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 22,976 kB
  • sloc: cpp: 7,535; sh: 15; makefile: 9
file content (96 lines) | stat: -rw-r--r-- 2,615 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
/**
 * gentrigrams.cpp
 *
 * Parse a corpus of data and generate trigrams
 *
 * SPDX-FileCopyrightText: 2013 Martin Sandsmark <martin.sandsmark@kde.org>
 *
 * SPDX-License-Identifier: LGPL-2.1-or-later
 */

#include "guesslanguage.h"

#include <QDebug>
#include <QFile>
#include <QHash>
#include <QString>

int main(int argc, char *argv[])
{
    if (argc < 3) {
        qWarning() << argv[0] << "corpus.txt outfile.trigram";
        return -1;
    }

    QFile file(QString::fromLocal8Bit(argv[1]));
    if (!file.open(QIODevice::ReadOnly | QFile::Text)) {
        qWarning() << "Unable to open corpus:" << argv[1];
        return -1;
    }
    QTextStream stream(&file);
    // Not needed with Qt6, UTF-8 is the default
#if QT_VERSION < QT_VERSION_CHECK(6, 0, 0)
    stream.setCodec("UTF-8");
#endif

    QFile outFile(QString::fromLocal8Bit(argv[2]));
    if (!outFile.open(QIODevice::WriteOnly)) {
        qWarning() << "Unable to open output file" << argv[2];
        return -1;
    }

    QHash<QString, int> model;
    qDebug() << "Reading in" << file.size() << "bytes";
    QString trigram = stream.read(3);
    QString contents = stream.readAll();
    qDebug() << "finished reading!";
    qDebug() << "Building model...";
    for (int i = 0; i < contents.size(); i++) {
        if (!contents[i].isPrint()) {
            continue;
        }
        model[trigram]++;
        trigram[0] = trigram[1];
        trigram[1] = trigram[2];
        trigram[2] = contents[i];
    }
    qDebug() << "model built!";

    qDebug() << "Sorting...";
    QMultiMap<int, QString> orderedTrigrams;

    for (auto it = model.cbegin(); it != model.cend(); ++it) {
        const QString data = it.key();
        Q_ASSERT(data.size() >= 3);
        bool hasTwoSpaces = ((data.size() > 1 && data[0].isSpace() && data[1].isSpace()) //
                             || (data.size() > 2 && data[1].isSpace() && data[2].isSpace()));

        if (!hasTwoSpaces) {
            orderedTrigrams.insert(it.value(), data);
        }
    }

    qDebug() << "Sorted!";

    qDebug() << "Weeding out...";

    auto i = orderedTrigrams.begin();
    while (orderedTrigrams.size() > Sonnet::MAXGRAMS) {
        i = orderedTrigrams.erase(i);
    }
    qDebug() << "Weeded!";

    qDebug() << "Storing...";
    i = orderedTrigrams.end();
    int count = 0;
    QTextStream outStream(&outFile);
    // Not needed with Qt6, UTF-8 is the default
#if QT_VERSION < QT_VERSION_CHECK(6, 0, 0)
    outStream.setCodec("UTF-8");
#endif

    while (i != orderedTrigrams.begin()) {
        --i;
        outStream << *i << "\t\t\t" << count++ << '\n';
    }
}