File: VcfFilter.g

package info (click to toggle)
snpsift 5.2.e%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 28,968 kB
  • sloc: java: 18,365; xml: 159; sh: 108; makefile: 19
file content (140 lines) | stat: -rw-r--r-- 4,421 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
//-----------------------------------------------------------------------------
//
// Filter parsing grammar
//			Pablo Cingolani
//
//-----------------------------------------------------------------------------
grammar VcfFilter;

// The generated parser builds an abstract syntax tree (AST) as its output.
options { output = AST; }

// Imaginary tokens: they are never matched in the input text, they only name
// the AST nodes created by the rewrite rules ('-> ^(...)') of the parser.
tokens {
	// Root node of a filter expression
	CONDITION;

	// Operator nodes
	OP_BINARY;
	OP_UNARY;

	// Variable reference nodes
	VAR_FIELD;
	VAR_SUBFIELD;
	VAR_GENOTYPE;
	VAR_GENOTYPE_SUB;
	VAR_EFF_SUB;
	VAR_LOF_SUB;
	VAR_NMD_SUB;
	VAR_GENOTYPE_SUB_ARRAY;

	// Function call nodes
	FUNCTION_ENTRY;
	FUNCTION_BOOL_GENOTYPE;
	FUNCTION_BOOL_SET;

	// Literal value nodes
	LITERAL_NUMBER;
	LITERAL_STRING;
}

// Java package declaration placed at the top of the generated lexer source.
@lexer::header { package ca.mcgill.mcb.pcingola.snpSift.antlr; }

// Same package declaration for the generated parser source.
@header { package ca.mcgill.mcb.pcingola.snpSift.antlr; }


//-----------------------------------------------------------------------------
// Lexer
//-----------------------------------------------------------------------------

// Blanks (spaces and tabs) carry no meaning: they go to the hidden channel,
// so the parser never sees them.
WS
	:	( ' ' | '\t' )+
		{ $channel = HIDDEN; }
	;

// Line breaks, written either as '\n' or as '\r' followed by '\n'.
// A run of consecutive line breaks is folded into one token, which is sent
// to the hidden channel as well.
NEWLINE
	:	( '\r'? '\n' )+
		{ $channel = HIDDEN; }
	;

// Fragment rules: building blocks for the token rules of this lexer; they
// never produce a token on their own.

// Unsigned integer: one or more digits
fragment NUMBER
	:	DIGIT+
	;

// One decimal digit
fragment DIGIT
	:	'0'..'9'
	;

// One ASCII letter, either case
fragment LETTER
	:	LOWER
	|	UPPER
	;
fragment LOWER
	:	'a'..'z'
	;
fragment UPPER
	:	'A'..'Z'
	;

// One ASCII letter or one decimal digit
fragment ALPHANUM
	:	LETTER
	|	DIGIT
	;

// 'C++' style single line comment: from '//' up to, but not including, the
// end of the line. The token is sent to the hidden channel.
// The line terminator is deliberately NOT part of this rule: requiring it
// made a comment on the last line of the input (no trailing newline) fail to
// lex. When a line break follows, the NEWLINE rule matches and hides it, so
// the parser sees exactly the same token stream as before.
COMMENT_SL : '//' ~('\r' | '\n')*	{ $channel=HIDDEN; };

// Numeric literal (integer or floating point): an optional sign, an integer
// part, an optional fractional part and an optional exponent.
// Examples: 1, -2.5, 3e8, +1.0E-3
FLOAT
	:	( '+' | '-' )?
		NUMBER ( '.' NUMBER )?
		( ( 'e' | 'E' ) ( '+' | '-' )? NUMBER )?
	;

// String literal enclosed in single quotes. It cannot contain a line break
// or a single quote. The action removes the enclosing quotes, so the token
// text is only the content between them.
STRING
	:	'\'' ~( '\n' | '\r' | '\'' )* '\''
		{ setText( getText().substring( 1, getText().length() - 1 ) ); }
	;

// An identifier: one or more letters, digits, underscores or dots.
// At least one character is required ('+' instead of '*'): a token rule that
// can match the empty string lets the lexer produce zero-length ID tokens
// without consuming any input when it meets a character that no other rule
// accepts, instead of reporting that character as an error.
ID : (ALPHANUM | '_' | '.' )+;

//-----------------------------------------------------------------------------
// Parser
//-----------------------------------------------------------------------------

// Entry point: a filter is one condition, wrapped in a CONDITION node.
main
	:	c=condition
		-> ^(CONDITION $c)
	;

// One or more sub-conditions joined by boolean operators. Every operator
// becomes the root of the tree built so far, so a chain is grouped from left
// to right, with no precedence between the operators.
condition
	:	subcondition ( boolOperator^ subcondition )*
	;

// A term, optionally negated; the '!' becomes the root of the negated term.
subcondition
	:	( '!'^ )? ( bare | paren )
	;

// A term written without enclosing parentheses
bare
	:	unaryExpr
	|	binaryExpr
	|	functionBoolean
	;

// A condition in parentheses; the parentheses are left out of the AST.
paren
	:	'('! condition ')'!
	;

// Comparison of two values: the AST node holds the operator first, then the
// left and the right operand.
binaryExpr
	:	lhs=expression op=binOperator rhs=expression
		-> ^(OP_BINARY $op $lhs $rhs)
	;

// Test on one value: the AST node holds the operator, then the operand.
unaryExpr
	:	op=uniOperator arg=expression
		-> ^(OP_UNARY $op $arg)
	;

// Operator spellings; every operation built from them yields a boolean.

// Joins two sub-conditions
boolOperator
	:	'&'
	|	'|'
	;

// Compares two expressions
binOperator
	:	'='
	|	'>='
	|	'>'
	|	'<='
	|	'<'
	|	'!='
	|	'=~'
	|	'!~'
	;

// Applies to a single expression
uniOperator
	:	'!'
	|	'na'
	|	'exists'
	;

// A value: a variable, a function of the whole VCF entry, or a literal.
expression
	:	var
	|	functionEntry
	|	literalFloat
	|	literalString
	;


// Literal values, each wrapped in its own AST node type
literalFloat
	:	num=FLOAT
		-> ^(LITERAL_NUMBER $num)
	;
literalString
	:	str=STRING
		-> ^(LITERAL_STRING $str)
	;
	
// Any kind of variable reference that can be used as a value
var
	:	varField
	|	varSubfield
	|	varGenotypeSub
	|	varGenotypeSubArray
	|	varEffSub
	|	varLofSub
	|	varNmdSub
	;
// A plain field name. 'EFF', 'LOF' and 'NMD' are literals of this grammar
// (see varEffSub, varLofSub and varNmdSub), so they are listed explicitly
// next to ID.
// Every alternative carries its own rewrite: in ANTLR 3 a '->' belongs only
// to the alternative it terminates, so one trailing rewrite wrapped only
// 'NMD' in a VAR_FIELD node and left the other alternatives as bare tokens.
varField
	:	i=ID		-> ^(VAR_FIELD $i)
	|	i='EFF'		-> ^(VAR_FIELD $i)
	|	i='LOF'		-> ^(VAR_FIELD $i)
	|	i='NMD'		-> ^(VAR_FIELD $i)
	;
// Indexed field: NAME[index]
varSubfield
	:	field=ID '[' pos=index ']'
		-> ^(VAR_SUBFIELD $field $pos)
	;

// A whole genotype: GEN[index]. This rule is not an alternative of 'var';
// it is only used as the argument of the boolean genotype functions.
varGenotype
	:	'GEN' '[' gt=index ']'
		-> ^(VAR_GENOTYPE $gt)
	;

// Genotype sub-field: GEN[index].NAME
varGenotypeSub
	:	'GEN' '[' gt=index '].' field=ID
		-> ^(VAR_GENOTYPE_SUB $gt $field)
	;

// Indexed genotype sub-field: GEN[index].NAME[index]
varGenotypeSubArray
	:	'GEN' '[' gt=index '].' field=ID '[' pos=index ']'
		-> ^(VAR_GENOTYPE_SUB_ARRAY $gt $field $pos)
	;

// Sub-field of an EFF entry: EFF[index].NAME
varEffSub
	:	'EFF' '[' entry=index '].' field=ID
		-> ^(VAR_EFF_SUB $entry $field)
	;

// Sub-field of a LOF entry: LOF[index].NAME
varLofSub
	:	'LOF' '[' entry=index '].' field=ID
		-> ^(VAR_LOF_SUB $entry $field)
	;

// Sub-field of an NMD entry: NMD[index].NAME
varNmdSub
	:	'NMD' '[' entry=index '].' field=ID
		-> ^(VAR_NMD_SUB $entry $field)
	;

// Functions computed from the whole VCF entry; they take no arguments and
// are used as values inside expressions.
functionEntry
	:	fn=functionEntryName '(' ')'
		-> ^(FUNCTION_ENTRY $fn)
	;
functionEntryName
	:	'countHom'
	|	'countHet'
	|	'countVariant'
	|	'countRef'
	;

// Functions whose result is a boolean (TRUE or FALSE)
functionBoolean
	:	functionGenotypeBool
	|	functionBooleanSet
	;

// Set membership test: <expression> in SET[index]
// The AST node holds the 'in' token, then the set index, then the expression.
functionBooleanSet
	:	value=expression op='in' 'SET' '[' setIdx=index ']'
		-> ^(FUNCTION_BOOL_SET $op $setIdx $value)
	;

// Boolean functions that take one genotype: NAME( GEN[index] )
functionGenotypeBool
	:	fn=functionGenotypeBoolName '(' gt=varGenotype ')'
		-> ^(FUNCTION_BOOL_GENOTYPE $fn $gt)
	;
functionGenotypeBoolName
	:	'isHom'
	|	'isHet'
	|	'isVariant'
	|	'isRef'
	;

// What may appear between '[' and ']': a number, or one of the wildcard
// spellings (for instance '*' for 'any').
index
	:	FLOAT
	|	'*'
	|	'ANY'
	|	'?'
	|	'ALL'
	;