File: syntax.cc

package info (click to toggle)
ht 0.5.0-1
links: PTS
area: main
in suites: woody
size: 3,388 kB
ctags: 9,064
sloc: cpp: 51,336; ansic: 11,954; sh: 2,742; yacc: 1,142; lex: 396; makefile: 178
file content (366 lines) | stat: -rw-r--r-- 9,670 bytes
/* 
 *	HT Editor
 *	syntax.cc
 *
 *	Copyright (C) 2001 Stefan Weyergraf (stefan@weyergraf.de)
 *
 *	This program is free software; you can redistribute it and/or modify
 *	it under the terms of the GNU General Public License version 2 as
 *	published by the Free Software Foundation.
 *
 *	This program is distributed in the hope that it will be useful,
 *	but WITHOUT ANY WARRANTY; without even the implied warranty of
 *	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *	GNU General Public License for more details.
 *
 *	You should have received a copy of the GNU General Public License
 *	along with this program; if not, write to the Free Software
 *	Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include <stdlib.h>
#include <string.h>
#include <sys/types.h>

#include "htexcept.h"
extern "C" {
#include "regex.h"
}
#include "syntax.h"

/*
 *	Test whether the compiled regular expression preg matches str
 *	right at its first character.
 *
 *	Returns true only if regexec() reports a match that starts at
 *	offset 0; a match that begins further into the string is rejected.
 *	On success, and if len is not NULL, *len receives the length of the
 *	match (which may be 0). On failure *len is left untouched.
 */
bool regmatch(char *str, regex_t *preg, int *len)
{
	regmatch_t span;

	if (regexec(preg, str, 1, &span, 0)!=0) return false;
	/* the expression matched, but not at the start of the string */
	if (span.rm_so!=0) return false;

	if (len) *len=span.rm_eo-span.rm_so;
	return true;
}

/*
 *	Binary search for a token in a sorted table of C strings.
 *
 *	s             - start of the token; it need NOT be NUL-terminated at
 *	                s[slen], it usually points into a longer text buffer
 *	slen          - number of characters of s that make up the token
 *	strings       - table of NUL-terminated strings in strcmp() order
 *	                (as produced by create_sorted_stringtable())
 *	strings_count - number of entries in strings
 *
 *	Returns true if some table entry is exactly the slen characters at s.
 *
 *	Only the first slen characters of s take part in the comparison.
 *	Looking further (as an earlier version did) lets the character that
 *	happens to follow the token steer the search: for the text "do{"
 *	the '{' compares greater than the 'u' of "double", the search moved
 *	to the right and never found "do".
 */
bool match_sorted_stringtable(char *s, UINT slen, char **strings, UINT strings_count)
{
	int a=0, b=(int)strings_count-1;
	while (a<=b) {
		int m=a+((b-a)>>1);
		/* strncmp() stops at the NUL of a shorter table entry, which
		   then correctly sorts before the token */
		int d=strncmp(s, strings[m], slen);
		/* token is a proper prefix of the entry: token sorts first */
		if ((d==0) && (strlen(strings[m])>slen)) d=-1;
		if (d<0) {
			b=m-1;
		} else if (d>0) {
			a=m+1;
		} else {
			return true;
		}
	}
	return false;
}

/*
 *	CLASS ht_syntax_lexer
 */


/*
 *	CLASS ht_lang_syntax_lexer
 */

/*
 *	Bit mask with only the bit belonging to lexer state "state" set.
 *	The needstate field of a rule is an OR of such masks;
 *	ht_lang_syntax_lexer::gettoken() tests it against
 *	LSTSET(current state) to decide whether the rule applies.
 */
#define LSTSET(state) (1<<(state))

/*
 *	Rule initializer: in the given states accept any single character
 *	(LRST_ANYCHAR) and report it as "token". The state field is 0, which
 *	gettoken() treats as "leave the state unchanged".
 *	NOTE(review): the positional field order (needstate, need_line_start,
 *	string_type, string, state, token) is inferred from how the rule
 *	tables are used here; confirm against the declaration of
 *	syntax_lexer_rule in syntax.h.
 */
#define SL_RULE_ANYCHAR(needstates, token)\
	{ needstates,\
	  false, LRST_ANYCHAR, NULL, 0, token }

/*
 *	Rule initializer: in the given states switch to "state" when the end
 *	of the line is reached. It is written as the regex "$", which
 *	gettoken() special-cases to match once the buffer is exhausted.
 *	The token field is 0.
 */
#define SL_RULE_LINEEND(needstates, state)\
	{ needstates,\
	  false, LRST_REGEX, "$", state, 0 }

/*
 *	Initialize the base lexer, then install the rule table lr.
 *	set_lexer_rules() counts the rules and precompiles the regex ones,
 *	and throws ht_exception if one of them does not compile.
 */
void ht_lang_syntax_lexer::init(syntax_lexer_rule *lr)
{
	ht_syntax_lexer::init();
	set_lexer_rules(lr);
}

/*
 *	Release the precompiled rule data first, then shut down the base
 *	lexer (the reverse of the order used in init()).
 */
void ht_lang_syntax_lexer::done()
{
	free_lexer_rules();
	ht_syntax_lexer::done();
}

/*
 *	Release everything set_lexer_rules() allocated: the compiled regex
 *	of every LRST_REGEX rule and the table of pointers itself.
 *	The rule table (lexer_rules) is not owned by the lexer and is left
 *	alone.
 *
 *	The table pointer is reset to NULL afterwards, so calling this a
 *	second time is harmless instead of freeing the same memory twice.
 */
void ht_lang_syntax_lexer::free_lexer_rules()
{
	if (!lexer_rules_precompiled) return;

	for (int i=0; i<lexer_rules_count; i++) {
		/* only regex rules own a compiled expression */
		if ((lexer_rules[i].string_type==LRST_REGEX) && lexer_rules_precompiled[i]) {
			regfree((regex_t*)lexer_rules_precompiled[i]);
			free(lexer_rules_precompiled[i]);
		}
		lexer_rules_precompiled[i]=NULL;
	}
	free(lexer_rules_precompiled);
	lexer_rules_precompiled=NULL;
}

/*
 *	Find the first rule of the rule table that applies at buf.
 *
 *	buf                 - NUL-terminated text to scan
 *	state               - current lexer state; overwritten with the state
 *	                      of the matching rule if that state is non-zero
 *	p                   - not used by this implementation
 *	len                 - receives the number of characters consumed
 *	start_of_line       - true if buf is at the beginning of a line
 *	only_state_changers - if true, only rules whose target state is
 *	                      non-zero and differs from *state are tried
 *
 *	Rules are tried in table order and the first match wins. A rule
 *	matches if it consumes at least one character; the only zero-length
 *	match accepted is the regex "$" at the end of the buffer.
 *
 *	If no rule matches, a single character is reported as the error
 *	token, or token 0 with length 0 if the buffer is exhausted.
 */
lexer_token ht_lang_syntax_lexer::gettoken(char *buf, lexer_state *state, text_pos *p, UINT *len, bool start_of_line, bool only_state_changers)
{
	for (int idx=0; lexer_rules[idx].needstate; idx++) {
		syntax_lexer_rule *rule=&lexer_rules[idx];

		/* rule must be enabled for the current state */
		if (!(rule->needstate & LSTSET(*state))) continue;
		/* rule may be restricted to the beginning of a line */
		if (rule->need_line_start && !start_of_line) continue;
		/* optionally ignore rules that would not alter the state */
		if (only_state_changers && ((rule->state==*state) || !rule->state)) continue;

		int matchlen=0;
		bool empty_match=false;

		if (rule->string_type==LRST_ANYCHAR) {
			if (*buf) matchlen=1;
		} else if (rule->string_type==LRST_WHITESPACE) {
			if ((*buf) && ((byte)*buf<=32)) matchlen=1;
		} else if (rule->string_type==LRST_STRING) {
			int want=strlen(rule->string);
			if (strncmp(buf, rule->string, want)==0) matchlen=want;
		} else if (rule->string_type==LRST_REGEX) {
			if ((strcmp(rule->string, "$")==0) && (!*buf)) {
				/* end of line: nothing is consumed, but it still counts */
				empty_match=true;
			} else {
				/* matchlen stays 0 unless the regex matches at buf */
				regmatch(buf, (regex_t*)lexer_rules_precompiled[idx], &matchlen);
			}
		} else if (rule->string_type==LRST_CHARSET) {
			if ((*buf) && strchr(rule->string, *buf)) matchlen=1;
		}
		/* LRST_EMPTY (and anything unknown) never matches */

		if (!empty_match && !matchlen) continue;

		if (rule->state) *state=rule->state;
		*len=matchlen;
		return rule->token;
	}

	/* error, no rule matched... */
	if (!*buf) {
		*len=0;
		return 0;
	}
	*len=1;
	return geterrortoken();
}

/*
 *	Install the rule table lr and precompile its regular expressions.
 *
 *	lr must be terminated by a rule whose needstate is 0; the table is
 *	referenced, not copied. For every LRST_REGEX rule the pattern is
 *	prefixed with '^' (so it can only match at the scan position) and
 *	compiled with REG_EXTENDED; all other rules get a NULL entry in
 *	lexer_rules_precompiled.
 *
 *	Throws ht_exception if a pattern does not compile. In that case
 *	everything allocated so far is released, lexer_rules_precompiled is
 *	NULL and lexer_rules_count is 0, so that a later free_lexer_rules()
 *	has nothing left to free.
 */
void ht_lang_syntax_lexer::set_lexer_rules(syntax_lexer_rule *lr)
{
	lexer_rules=lr;

	lexer_rules_count=0;
	while (lexer_rules[lexer_rules_count].needstate) {
		lexer_rules_count++;
	}

	/* one slot per rule; each slot holds a pointer */
	lexer_rules_precompiled=(void**)malloc(sizeof (void*) * lexer_rules_count);
	for (int i=0; i<lexer_rules_count; i++) {
		if (lexer_rules[i].string_type==LRST_REGEX) {
			/* add an anchor in front of regex */
			int rl=strlen(lexer_rules[i].string)+1;
			char *regex=(char*)malloc(1+rl);
			*regex='^';
			memcpy(regex+1, lexer_rules[i].string, rl);

			regex_t *preg=(regex_t*)malloc(sizeof (regex_t));
			int err=regcomp(preg, regex, REG_EXTENDED);
			/* the pattern text is not needed after compilation */
			free(regex);

			if (err) {
				/* a failed regcomp() leaves nothing to regfree() */
				free(preg);
				/* undo the rules that were already compiled */
				for (int j=0; j<i; j++) {
					if (lexer_rules_precompiled[j]) {
						regfree((regex_t*)lexer_rules_precompiled[j]);
						free(lexer_rules_precompiled[j]);
					}
				}
				free(lexer_rules_precompiled);
				lexer_rules_precompiled=NULL;
				lexer_rules_count=0;
				throw ht_exception();
			}

			lexer_rules_precompiled[i]=preg;
		} else {
			lexer_rules_precompiled[i]=NULL;
		}
	}
}

/*
 *	CLASS ht_c_syntax_lexer
 */

/*
 *	C lexer states.
 *	Used as bit numbers by LSTSET(), so they have to stay small.
 *	0 is not a state: in a rule it means "do not change the state".
 */
#define LEX_CST_NORMAL			1
#define LEX_CST_STRING			2
#define LEX_CST_PREPROCESS		3
#define LEX_CST_COMMENT			4
#define LEX_CST_COMMENT_EOL		5

/*
 *	C lexer tokens.
 *	0 is not listed: ht_lang_syntax_lexer::gettoken() returns it at the
 *	end of the buffer, and the line-end rules carry it as their token.
 *	LEX_CTOK_RIDENTIFIER is never produced by a rule;
 *	ht_c_syntax_lexer::gettoken() substitutes it for
 *	LEX_CTOK_IDENTIFIER when the identifier is a reserved word.
 */
#define LEX_CTOK_ERROR			1
#define LEX_CTOK_WHITESPACE		2
#define LEX_CTOK_COMMENT			3
#define LEX_CTOK_PREPROCESS		4
#define LEX_CTOK_IDENTIFIER		5
#define LEX_CTOK_RIDENTIFIER		6
#define LEX_CTOK_NUMBER			7
#define LEX_CTOK_FNUMBER			8
#define LEX_CTOK_STRING			9
#define LEX_CTOK_CHAR			10
#define LEX_CTOK_SYMBOL			11

/*
 *	Lexer rules for C/C++ sources.
 *
 *	Each entry reads: states the rule is valid in, whether it only
 *	applies at the start of a line, kind of pattern, pattern, state to
 *	switch to (0 = keep the current state), token to report.
 *	ht_lang_syntax_lexer::gettoken() tries the rules from top to bottom
 *	and takes the first one that matches, so the order matters: longer
 *	or more specific patterns have to come before the catch-alls.
 *	The table ends with a rule whose state mask is 0.
 *
 *	NOTE(review): in state LEX_CST_PREPROCESS the any-char rule comes
 *	before the two comment rules that also list that state, so the
 *	comment rules can only be reached with only_state_changers set.
 *	Presumably intended; confirm against the callers.
 */
syntax_lexer_rule c_syntax_lexer_rules[]={
/* preprocessor directives */
	{ LSTSET(LEX_CST_NORMAL),
	  true, LRST_REGEX, " *#", LEX_CST_PREPROCESS, LEX_CTOK_PREPROCESS },
	SL_RULE_ANYCHAR(LSTSET(LEX_CST_PREPROCESS), LEX_CTOK_PREPROCESS),
	SL_RULE_LINEEND(LSTSET(LEX_CST_PREPROCESS), LEX_CST_NORMAL),
/* whitespaces */
	{ LSTSET(LEX_CST_NORMAL),
	  false, LRST_WHITESPACE, NULL, 0, LEX_CTOK_WHITESPACE },
/* '/ *' - '* /' multiline comments */
	{ LSTSET(LEX_CST_NORMAL) | LSTSET(LEX_CST_PREPROCESS),
	  false, LRST_STRING, "/*", LEX_CST_COMMENT, LEX_CTOK_COMMENT },
	{ LSTSET(LEX_CST_COMMENT),
	  false, LRST_STRING, "*/", LEX_CST_NORMAL, LEX_CTOK_COMMENT },
	SL_RULE_ANYCHAR(LSTSET(LEX_CST_COMMENT), LEX_CTOK_COMMENT),
/* "..." (multiline) strings */
	{ LSTSET(LEX_CST_NORMAL),
	  false, LRST_STRING, "\"", LEX_CST_STRING, LEX_CTOK_STRING },
	/* an escaped backslash has to be consumed as a pair, otherwise its
	   second half would escape a closing quote that follows it */
	{ LSTSET(LEX_CST_STRING),
	  false, LRST_STRING, "\\\\", LEX_CST_STRING, LEX_CTOK_STRING },
	/* an escaped quote does not end the string */
	{ LSTSET(LEX_CST_STRING),
	  false, LRST_STRING, "\\\"", LEX_CST_STRING, LEX_CTOK_STRING },
	{ LSTSET(LEX_CST_STRING),
	  false, LRST_STRING, "\"", LEX_CST_NORMAL, LEX_CTOK_STRING },
	SL_RULE_ANYCHAR(LSTSET(LEX_CST_STRING), LEX_CTOK_STRING),
/* '//' one line comments */
	{ LSTSET(LEX_CST_NORMAL) | LSTSET(LEX_CST_PREPROCESS),
	  false, LRST_STRING, "//", LEX_CST_COMMENT_EOL, LEX_CTOK_COMMENT },
	SL_RULE_ANYCHAR(LSTSET(LEX_CST_COMMENT_EOL), LEX_CTOK_COMMENT),
	SL_RULE_LINEEND(LSTSET(LEX_CST_COMMENT_EOL), LEX_CST_NORMAL),
/* symbols */
	{ LSTSET(LEX_CST_NORMAL),
	  false, LRST_CHARSET, "(){};,.[]!~%+-/*=<>|&^?:", 0, LEX_CTOK_SYMBOL },
/* identifiers */
	{ LSTSET(LEX_CST_NORMAL),
	  false, LRST_REGEX, "[_a-zA-Z][_a-zA-Z0-9]*", 0, LEX_CTOK_IDENTIFIER },
/* floats (the exponent may be written with 'e' or 'E') */
	{ LSTSET(LEX_CST_NORMAL),
	  false, LRST_REGEX, "[0-9]+\\.[0-9]+([eE][+-]?[0-9]+)?", 0, LEX_CTOK_FNUMBER },
/* numbers (hex has to be tried before plain decimal) */
	{ LSTSET(LEX_CST_NORMAL),
	  false, LRST_REGEX, "0[xX][0-9a-fA-F]+", 0, LEX_CTOK_NUMBER },
	{ LSTSET(LEX_CST_NORMAL),
	  false, LRST_REGEX, "[0-9]+", 0, LEX_CTOK_NUMBER },
/* chars */
	{ LSTSET(LEX_CST_NORMAL),
	  false, LRST_REGEX, "'[^'\\]'", 0, LEX_CTOK_CHAR },
	{ LSTSET(LEX_CST_NORMAL),
	  false, LRST_REGEX, "'\\\\.{1,3}'", 0, LEX_CTOK_CHAR },
/* terminator: state mask 0 ends the table */
	{ 0, false, LRST_EMPTY, NULL, 0, 0 }
};

/*
 *	Reserved words of C/C++, NULL-terminated.
 *	The order in this table is irrelevant for lookups:
 *	ht_c_syntax_lexer::init() builds a sorted copy of it.
 */
char *c_reserved[]=
{
	/* built-in types */
	"bool", "char", "void", "int", "short",
	"long", "unsigned", "signed", "float", "double",

	/* constants */
	"true", "false",

	/* statements */
	"return", "if", "else", "while", "do",
	"goto", "asm", "switch", "case", "default",
	"break", "continue", "for",

	/* objects */
	"new", "delete", "this",

	/* declarations */
	"struct", "union", "enum", "class",
	"template", "operator", "typedef",

	/* modifiers */
	"public", "protected", "private", "friend",
	"const", "extern", "inline", "register",
	"static", "volatile",

	/* exceptions */
	"try", "catch", "throw",

	/* miscellaneous */
	"sizeof",

	/* end of table */
	NULL
};

/*
 *	Set up the lexer with the C rule table and prepare the reserved
 *	word lookup: a sorted copy of c_reserved plus the number of words
 *	in it (the terminating NULL is not counted).
 */
void ht_c_syntax_lexer::init()
{
	ht_lang_syntax_lexer::init(c_syntax_lexer_rules);
	c_reserved_sorted=create_sorted_stringtable(c_reserved);

	/* c_reserved is NULL-terminated, count up to the terminator */
	int words=0;
	while (c_reserved[words]) words++;
	c_reserved_count=words;
}

/*
 *	Free the sorted reserved word table built in init(), then shut down
 *	the base lexer. Only the array of pointers is freed; the strings it
 *	points to are the ones of c_reserved.
 */
void ht_c_syntax_lexer::done()
{
	free(c_reserved_sorted);
	ht_lang_syntax_lexer::done();
}

/*
 *	Returns the initial lexer state for C sources, LEX_CST_NORMAL.
 */
lexer_state ht_c_syntax_lexer::getinitstate()
{
	return LEX_CST_NORMAL;
}

/*
 *	Returns the token that ht_lang_syntax_lexer::gettoken() reports
 *	for a character that no rule matched.
 */
lexer_token ht_c_syntax_lexer::geterrortoken()
{
	return LEX_CTOK_ERROR;
}

/*
 *	Same as ht_lang_syntax_lexer::gettoken(), except that identifiers
 *	found in the reserved word table are reported as
 *	LEX_CTOK_RIDENTIFIER instead of LEX_CTOK_IDENTIFIER.
 *	All parameters are passed on unchanged.
 */
lexer_token ht_c_syntax_lexer::gettoken(char *buf, lexer_state *state, text_pos *p, UINT *len, bool start_of_line, bool only_state_changers)
{
	lexer_token tok=ht_lang_syntax_lexer::gettoken(buf, state, p, len, start_of_line, only_state_changers);

	/* only identifiers can be reserved words */
	if (tok!=LEX_CTOK_IDENTIFIER) return tok;

	bool reserved=match_sorted_stringtable(buf, *len, c_reserved_sorted, c_reserved_count);
	if (reserved) tok=LEX_CTOK_RIDENTIFIER;
	return tok;
}

/*
 *	Map a C lexer token to the color pair it is displayed in.
 *	Tokens without an entry of their own (LEX_CTOK_ERROR among them)
 *	get VCP(VC_BLACK, VC_RED).
 */
vcp ht_c_syntax_lexer::gettoken_color(lexer_token t)
{
	switch (t) {
		case LEX_CTOK_WHITESPACE:
			return VCP(VC_TRANSPARENT, VC_TRANSPARENT);
		case LEX_CTOK_COMMENT:
			return VCP(VC_WHITE, VC_TRANSPARENT);
		case LEX_CTOK_STRING:
			return VCP(VC_CYAN, VC_TRANSPARENT);
		case LEX_CTOK_PREPROCESS:
			return VCP(VC_LIGHT(VC_GREEN), VC_TRANSPARENT);
		case LEX_CTOK_IDENTIFIER:
			return VCP(VC_LIGHT(VC_YELLOW), VC_TRANSPARENT);
		case LEX_CTOK_FNUMBER:
			return VCP(VC_LIGHT(VC_MAGENTA), VC_TRANSPARENT);
		/* integer and character constants share a color */
		case LEX_CTOK_NUMBER:
		case LEX_CTOK_CHAR:
			return VCP(VC_LIGHT(VC_CYAN), VC_TRANSPARENT);
		/* so do reserved words and symbols */
		case LEX_CTOK_RIDENTIFIER:
		case LEX_CTOK_SYMBOL:
			return VCP(VC_LIGHT(VC_WHITE), VC_TRANSPARENT);
	}
	/* no entry above */
	return VCP(VC_BLACK, VC_RED);
}

/*
 *	sorted stringtable
 */

/*
 *	qsort() comparison callback for arrays of C strings.
 *	e1 and e2 point to array elements, i.e. to the string pointers,
 *	not to the characters. The result is that of strcmp().
 */
int qsort_stringlist(const void *e1, const void *e2)
{
	const char *left=*(const char *const *)e1;
	const char *right=*(const char *const *)e2;
	return strcmp(left, right);
}
	
/*
 *	Build a sorted copy of a NULL-terminated string table.
 *
 *	Only the array of pointers is duplicated, the strings stay shared
 *	with table. The copy is sorted in strcmp() order and keeps the NULL
 *	terminator as its last element. The caller has to free() it.
 */
char **create_sorted_stringtable(char **table)
{
	size_t count=0;
	while (table[count]) count++;

	/* room for all entries plus the terminating NULL */
	size_t bytes=sizeof (char*) * (count+1);
	char **sorted=(char **)malloc(bytes);
	memcpy(sorted, table, bytes);

	/* the terminator is not part of the sort */
	qsort(sorted, count, sizeof (char*), qsort_stringlist);
	return sorted;
}