File: languageinfer.cpp

package info (click to toggle)
source-highlight 3.1.8-1.2~deb9u1
  • links: PTS
  • area: main
  • in suites: stretch
  • size: 10,224 kB
  • sloc: sh: 11,709; cpp: 10,226; ansic: 9,521; makefile: 1,902; lex: 1,200; yacc: 1,021; php: 213; perl: 211; awk: 98; erlang: 94; lisp: 90; java: 75; ruby: 69; python: 61; asm: 43; ada: 36; ml: 29; haskell: 27; xml: 23; cs: 11; sql: 8; tcl: 7; sed: 4
file content (160 lines) | stat: -rw-r--r-- 4,336 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
//
// C++ Implementation: languageinfer
//
// Description:
//
//
// Author: Lorenzo Bettini <http://www.lorenzobettini.it>, (C) 2006
//
// Copyright: See COPYING file that comes with this distribution
//
//

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include "languageinfer.h"
#include "fileutil.h"

#include <boost/algorithm/string.hpp>
#include <boost/regex.hpp>
#include <vector>

using namespace std;

namespace srchilite {

// Default constructor: the body is empty, so no set-up is performed here.
LanguageInfer::LanguageInfer()
{
}

// Destructor: the body is empty, so no explicit clean-up is performed here.
LanguageInfer::~LanguageInfer()
{
}

// Infers the language of the file with the given name.
//
// The file is opened through open_file_istream_or_error() and the actual
// inference is delegated to infer(istream &).
//
// filename: the name of the file to inspect
// returns:  whatever infer(istream &) returns for the opened stream
//           (the empty string when no language can be inferred)
//
// NOTE(review): how open_file_istream_or_error() reports a failure is not
// visible from here; it is assumed never to hand back a null pointer.
const string LanguageInfer::infer(const string &filename) {
    // the stream is owned by this function and must be deleted on every path
    istream *stream = open_file_istream_or_error(filename);

    string result;
    try {
        result = infer(*stream);
    } catch (...) {
        // do not leak the stream if the inference throws (the regex
        // searches and the string handling may raise exceptions);
        // the exception itself is passed on unchanged to the caller
        delete stream;
        throw;
    }

    delete stream;

    return result;
}

// Extracts the language (mode) name from the contents of an Emacs
// specification, i.e., from the text found between the -*- markers.
//
// The text is split into ';'-separated entries, examined in order:
//  - empty entries (e.g., due to a leading, trailing or doubled ';')
//    are skipped;
//  - an entry without ':' is considered the language itself
//    (as in -*- lang -*-);
//  - an entry of the shape "mode: lang" yields lang; the "mode" keyword
//    is matched ignoring case, so that the widespread "Mode: lang"
//    spelling is recognized as well;
//  - any other "variable: value" entry is ignored.
//
// modeline: the text between the -*- markers
// returns:  the language name (trimmed, case left untouched), or the
//           empty string if the text specifies no language
const string guessEmacsMode(const string &modeline) {
    vector<string> entries;
    boost::split(entries, modeline, boost::is_any_of(";"));

    for (vector<string>::iterator it = entries.begin(); it != entries.end(); ++it) {
        boost::trim(*it);

        // an empty entry carries no information: keep looking instead of
        // giving up, since a mode specification may still follow
        if (it->empty())
            continue;

        // the entry is not empty, thus splitting yields at least one token
        vector<string> tokens;
        boost::split(tokens, *it, boost::is_any_of(":"));
        boost::trim(tokens[0]);

        // A single token is considered a language definition
        if (tokens.size() == 1)
            return tokens[0];

        // otherwise, look for -*- mode: lang -*-
        boost::trim(tokens[1]);
        if (boost::iequals(tokens[0], "mode"))
            return tokens[1];
    }

    return "";
}

// Searches the given line for an Emacs specification (-*- ... -*-) and
// passes the text captured by the first group of emacsRegEx on to
// guessEmacsMode().
//
// line:       the line to inspect
// emacsRegEx: the regular expression capturing the specification contents
// returns:    the language guessed from the specification, or the empty
//             string if the line has no specification or no language
//             can be extracted from it
static const string inferFromEmacsSpec(const string &line,
        const boost::regex &emacsRegEx) {
    boost::match_results<std::string::const_iterator> what;

    // the match results are meaningful only if the search succeeded
    if (boost::regex_search(line, what, emacsRegEx, boost::match_default)
            && what[1].matched)
        return guessEmacsMode(what[1]);

    return "";
}

// Infers the language from the first two lines read from the stream.
//
// The following checks are performed, in this order, and the first one
// that succeeds determines the result:
//  1. an Emacs specification (-*- ... -*-) in the second line;
//  2. an Emacs specification in the first line;
//  3. a sha-bang line going through env, such as #! /usr/bin/env perl;
//  4. a plain sha-bang line, such as #! /bin/bash;
//  5. a line containing <?lang, such as xml and php;
//  6. a line containing <!DOCTYPE (in any case), which yields "xml".
//
// stream:  the input to inspect; two lines are consumed from it by
//          means of read_line()
// returns: the inferred language, or the empty string if none of the
//          checks succeeds
const string LanguageInfer::infer(istream &stream) {
    // the regular expression for finding the language specification in a script file
    // such as #! /bin/bash
    static boost::regex
            langRegEx(
                    "#[[:blank:]]*![[:blank:]]*(?:[\\./]*)(?:[[:alnum:]]+[\\./]+)*([[:alnum:]]+)");

    // the regular expression for finding the language specification in a script file
    // such as #! /usr/bin/env perl
    static boost::regex
            langEnvRegEx(
                    "#[[:blank:]]*![[:blank:]]*(?:[\\./]*)(?:[[:alnum:]]+[\\./]+)*(?:env)[[:blank:]]+([[:alnum:]]+)");

    // the regular expression for finding the language specification in a script file
    // according to Emacs convention: # -*- language -*-
    static boost::regex
            langRegExEmacs("-\\*-[[:blank:]]*([[:print:]]+).*-\\*-");

    // the regular expression for scripts starting with <?...
    // such as xml and php
    static boost::regex langXMLLikeScripts("<\\?([[:alnum:]]+)");

    // the regular expression for <!DOCTYPE
    static boost::regex langDocType("<![Dd][Oo][Cc][Tt][Yy][Pp][Ee]");

    string firstLine;
    string secondLine;

    // read the first line of the input
    read_line(&stream, firstLine);
    // and the second line
    read_line(&stream, secondLine);

    // the Emacs specification has the precedence in order to correctly infer
    // that scripts of the shape
    // #!/bin/sh
    // #  -*- tcl -*-
    // are Tcl scripts and not shell scripts;
    // for this reason the second line is tried before the first one
    string guess = inferFromEmacsSpec(secondLine, langRegExEmacs);
    if (guess != "")
        return guess;

    // try also on the first line
    guess = inferFromEmacsSpec(firstLine, langRegExEmacs);
    if (guess != "")
        return guess;

    // every search below checks the value returned by regex_search before
    // inspecting the match results, since these are not specified when
    // the search fails
    boost::match_results<std::string::const_iterator> what;

    // try also the env specification (before the plain sha-bang one,
    // which would otherwise match the path of env itself)
    if (boost::regex_search(firstLine, what, langEnvRegEx,
            boost::match_default) && what[1].matched)
        return what[1];

    // try the sha-bang specification
    if (boost::regex_search(firstLine, what, langRegEx, boost::match_default)
            && what[1].matched)
        return what[1];

    // the xml like starting scripts
    if (boost::regex_search(firstLine, what, langXMLLikeScripts,
            boost::match_default) && what[1].matched)
        return what[1];

    // the doctype case
    if (boost::regex_search(firstLine, what, langDocType,
            boost::match_default))
        return "xml";

    return "";
}

}