File: concepts.xml

package info (click to toggle)
uima-addons 2.3.1-10
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 10,044 kB
  • sloc: java: 48,492; xml: 42,522; javascript: 53; makefile: 8; sh: 8
file content (118 lines) | stat: -rw-r--r-- 5,005 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
<?xml version="1.0" encoding="UTF-8"?>
<!--
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements.  See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership.  The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License.  You may obtain a copy of the License at
	* 
	*   http://www.apache.org/licenses/LICENSE-2.0
	* 
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied.  See the License for the
	* specific language governing permissions and limitations
	* under the License.
-->
<!--
	Root of the concept set. The concepts declared inside it detect
	e-mail addresses, ISBN numbers, credit card numbers and money amounts.

	The default namespace applies to every unprefixed element below.
	xsi:schemaLocation is a list of whitespace-separated pairs
	(namespace URI, schema document location), so the concept namespace
	is paired here with the relative location of concept.xsd.
-->
<conceptSet xmlns="http://incubator.apache.org/uima/regex"
	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://incubator.apache.org/uima/regex concept.xsd">

	<concept name="emailAddressDetection">
		<!--
			Detects e-mail addresses of the form local@domain.

			Group 1 (local part): one or more letters, digits or the listed
			punctuation characters. \x2D is the hyphen and \x26 is the
			ampersand, both written as hex escapes.
			Group 2 (domain part): one or more letters, digits, dots,
			underscores or hyphens, then a literal dot and a final label of
			at least two letters. The literal dot is required so that a bare
			word after the @ sign is not reported as a domain.
		-->
		<rules>
			<rule
				regEx="([a-zA-Z0-9!#$%*+'/=?^_\x2D`{|}~.\x26]+)@([a-zA-Z0-9._-]+\.[a-zA-Z]{2,})"
				matchStrategy="matchAll" matchType="uima.tcas.DocumentAnnotation" />
		</rules>
		<createAnnotations>
			<!-- The annotation spans the whole match (group 0). -->
			<annotation id="emailAnnot"
				type="org.apache.uima.EmailAddress">
				<begin group="0" />
				<end group="0" />
				<!--
					The group placeholders are written inline: any indentation
					placed around them would become part of the element text.
					NOTE(review): whether the annotator trims that text is not
					visible in this file.
				-->
				<setFeature name="localPart" type="String"
					normalization="ToLowerCase">$1</setFeature>
				<setFeature name="domainPart" type="String"
					normalization="ToLowerCase">$2</setFeature>
				<setFeature name="normalizedEmail" type="String"
					normalization="ToLowerCase">$0</setFeature>
			</annotation>
		</createAnnotations>
	</concept>

	<concept name="isbnNumberDetection">
		<!--
			Candidate ISBN numbers. The pattern reads, left to right:
			  (97(8|9))?          optional 978 or 979 prefix
			  -?                  optional hyphen
			  (\d{9}|(\d|-){11})  nine digits, or eleven characters that are
			                      each a digit or a hyphen
			  -?                  optional hyphen
			  (\d|X)              final digit or an upper-case X
		-->
		<rules>
			<rule
				regEx="(97(8|9))?-?(\d{9}|(\d|-){11})-?(\d|X)"
				matchType="uima.tcas.DocumentAnnotation"
				matchStrategy="matchAll"
				confidence="1.0"/>
		</rules>
		<createAnnotations>
			<!--
				The annotation spans the whole match (group 0).
				NOTE(review): the class named in "validate" presumably accepts
				or rejects each candidate; its logic is not part of this file.
			-->
			<annotation
				id="isbnNumber"
				type="org.apache.uima.ISBNNumber"
				validate="org.apache.uima.annotator.regex.extension.impl.ISBNNumberValidator">
				<begin group="0"/>
				<end group="0"/>
				<setFeature name="confidence" type="Confidence"/>
			</annotation>
		</createAnnotations>
	</concept>

	<concept name="creditCardNumberDetection" processAllRules="true">
		<!--
			Candidate credit card numbers, one rule per card brand plus a
			generic fallback. In the brand rules "[- ]?" allows a single
			optional hyphen or space between digit groups.

			NOTE(review): processAllRules="true" presumably makes the annotator
			evaluate every rule instead of stopping at the first one that
			matches. A number matched by a brand rule can then also satisfy
			the "unknown" rule, whose first digit range 1 to 6 includes the
			brand prefixes. Confirm how overlapping matches are handled.
		-->
		<rules>
			<!-- 34 or 37 prefix, digits grouped 4-6-5. -->
			<rule
				ruleId="AmericanExpress"
				regEx="(((34|37)\d{2}[- ]?)(\d{6}[- ]?)\d{5})"
				matchType="uima.tcas.DocumentAnnotation"
				matchStrategy="matchAll"
				confidence="1.0"/>
			<!-- Leading 4, digits grouped 4-4-4-4. -->
			<rule
				ruleId="Visa"
				regEx="((4\d{3}[- ]?)(\d{4}[- ]?){2}\d{4})"
				matchType="uima.tcas.DocumentAnnotation"
				matchStrategy="matchAll"
				confidence="1.0"/>
			<!-- 51 to 55 prefix, digits grouped 4-4-4-4. -->
			<rule
				ruleId="MasterCard"
				regEx="((5[1-5]\d{2}[- ]?)(\d{4}[- ]?){2}\d{4})"
				matchType="uima.tcas.DocumentAnnotation"
				matchStrategy="matchAll"
				confidence="1.0"/>
			<!--
				First digit 1 to 6, in one of three shapes:
				4-4-4-4 with a mandatory hyphen or space between groups,
				14 to 19 contiguous digits, or
				4-6-5 with a mandatory hyphen or space between groups.
			-->
			<rule
				ruleId="unknown"
				regEx="(([1-6]\d{3}[- ])(\d{4}[- ]){2}\d{4})|([1-6]\d{13,18})|([1-6]\d{3}[- ]\d{6}[- ]\d{5})"
				matchType="uima.tcas.DocumentAnnotation"
				matchStrategy="matchAll"
				confidence="1.0"/>
		</rules>
		<createAnnotations>
			<!--
				The annotation spans the whole match (group 0).
				NOTE(review): the "validate" class presumably filters the
				candidates, and the "RuleId" feature type presumably stores the
				ruleId of the rule that matched; neither is shown in this file.
			-->
			<annotation
				id="creditCardNumber"
				type="org.apache.uima.CreditCardNumber"
				validate="org.apache.uima.annotator.regex.extension.impl.CreditCardNumberValidator">
				<begin group="0"/>
				<end group="0"/>
				<setFeature name="confidence" type="Confidence"/>
				<setFeature name="cardType" type="RuleId"/>
			</annotation>
		</createAnnotations>
	</concept>

	<concept name="MoneyAmountDetection" processAllRules="true">
		<!--
			Detects money amounts written either as "currency amount"
			(first rule) or as "amount currency" (second rule).

			Regex notation used below:
			  \p{Sc}  : currency symbol
			  (?i)    : case insensitive match
			  \s      : whitespace character
			  \b      : word boundary
			  \m{x}(  : names the group that follows, so that it can be
			            referenced as ${x} in the features below
		-->
		<rules>
			<!--
				Currency first. Every currency word starts at a word boundary,
				so the tail of a longer word (for example the "eur" ending
				"amateur") is not taken for a currency. The amount allows
				comma-separated thousands and up to two decimals.
			-->
			<rule regEx="\m{currency}(\p{Sc}\s?|(?i)\bUSD\s?|(?i)\bDollars\s?|(?i)\bDollar\s?|(?i)\bCNY\s?|(?i)\bCAD\s?|(?i)\bGBP\s?|(?i)\bPounds\s?|(?i)\bPound\s?|(?i)\bEuros\s?|(?i)\bEuro\s?|(?i)\bYen\s?|(?i)\bEUR\s?)\m{amount}(\d+(,\d\d\d)*(\.\d\d?)?)\m{amountText}(\s?(?i)million|\s?(?i)billion)?"
				matchStrategy="matchAll" matchType="uima.tcas.DocumentAnnotation"/>
			<!--
				Amount first. Every currency word ends at a word boundary.
				This rule allows up to three decimals.
			-->
			<rule regEx="\m{amount}(\d+(,\d\d\d)*(\.\d\d?\d?)?)\m{amountText}(\s?(?i)million|\s?(?i)billion)?\m{currency}(\s?\p{Sc}|\s?(?i)USD\b|\s?(?i)Dollars\b|\s?(?i)Dollar\b|\s?(?i)CNY\b|\s?(?i)CAD\b|\s?(?i)GBP\b|\s?(?i)Pounds\b|\s?(?i)Pound\b|\s?(?i)Euros\b|\s?(?i)Euro\b|\s?(?i)Yen\b|\s?(?i)EUR\b)"
				matchStrategy="matchAll" matchType="uima.tcas.DocumentAnnotation"/>
		</rules>
		<createAnnotations>
			<!--
				The annotation spans the whole match (group 0). The "Trim"
				normalization is applied to the two text features because their
				groups can include the optional whitespace matched by \s?.
			-->
			<annotation type="org.apache.uima.MoneyAmount">
				<begin group="0" />
				<end group="0" />
				<setFeature name="currency" type="String" normalization="Trim">${currency}</setFeature>
				<setFeature name="amount" type="Float">${amount}</setFeature>
				<setFeature name="amountText" type="String" normalization="Trim">${amountText}</setFeature>
			</annotation>
		</createAnnotations>
	</concept>
</conceptSet>