File: I_Sub.java

package info (click to toggle)
swi-prolog 9.0.4%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 82,408 kB
  • sloc: ansic: 387,503; perl: 359,326; cpp: 6,613; lisp: 6,247; java: 5,540; sh: 3,147; javascript: 2,668; python: 1,900; ruby: 1,594; yacc: 845; makefile: 428; xml: 317; sed: 12; sql: 6
file content (207 lines) | stat: -rw-r--r-- 5,288 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
/*
 * Created on 18 Oct 2004
 */
package fr.onagui.alignment.method;

/**
 * I-Sub string similarity: implements the string matching method proposed in
 * the paper "A String Metric For Ontology Alignment", published in ISWC 2005.
 *
 * The score combines three terms:
 * <ul>
 *   <li>commonality: total length of the common substrings (each longer than
 *       two characters) that are repeatedly extracted from both strings,
 *       scaled by the combined length;</li>
 *   <li>dissimilarity: a Hamacher product of the unmatched fractions of each
 *       string;</li>
 *   <li>a Winkler-style bonus for a shared prefix.</li>
 * </ul>
 *
 * @author Giorgos Stoilos
 */
public class I_Sub{

	/** A common substring only counts when it is strictly longer than this. */
	private static final int MIN_COMMON_LENGTH = 2;

	/** Hamacher product parameter; for 1 it coincides with the algebraic product. */
	private static final double HAMACHER_P = 0.6;

	/** Longest shared prefix that is rewarded by the Winkler bonus. */
	private static final int MAX_WINKLER_PREFIX = 4;

	/** Weight of each shared prefix character in the Winkler bonus. */
	private static final double WINKLER_SCALE = 0.1;

	/**
	 * Computes the I-Sub similarity of two strings.
	 *
	 * Both strings are lower-cased (locale-independently) and stripped of
	 * '.', '_' and ' ' before the common substrings are searched.
	 *
	 * @param st1 first string, may be null
	 * @param st2 second string, may be null
	 * @return -1 if either argument is null or exactly one normalized string
	 *         is empty; 1 if both normalized strings are empty; otherwise
	 *         commonality - dissimilarity + Winkler bonus (higher means more
	 *         similar)
	 */
	public static double score( String st1 , String st2 ){

		if( (st1 == null) || (st2 == null) ){
			return -1;
		}

		String s1 = normalize( st1 );
		String s2 = normalize( st2 );

		// Lengths of the normalized inputs; s1/s2 shrink below, these do not.
		final int L1 = s1.length();
		final int L2 = s2.length();

		if( (L1 == 0) && (L2 == 0) )
			return 1;
		if( (L1 == 0) || (L2 == 0) )
			return -1;

		// Repeatedly cut the longest common substring out of both strings
		// until what is left shares nothing longer than MIN_COMMON_LENGTH.
		double common = 0;
		while( s1.length() > 0 && s2.length() > 0 ){
			int[] match = longestCommonSubstring( s1 , s2 );
			int length = match[2];
			if( length <= MIN_COMMON_LENGTH )
				break;
			common += length;
			s1 = removeRange( s1 , match[0] , length );
			s2 = removeRange( s2 , match[1] , length );
		}

		double commonality = (2 * common) / (L1 + L2);

		// NOTE(review): the prefix bonus is computed on the raw arguments
		// (case-sensitive, separators included), not on the normalized
		// strings. Kept as-is to preserve existing scores - confirm intent.
		double winklerImprovement = winklerImprovement( st1 , st2 , commonality );

		// Fraction of each normalized string not covered by a common substring.
		double unmatchedS1 = (L1 - common) / L1;
		double unmatchedS2 = (L2 - common) / L2;

		/*
		 * Hamacher Product
		 */
		double suma = unmatchedS1 + unmatchedS2;
		double product = unmatchedS1 * unmatchedS2;
		double dissimilarity = 0;
		if( (suma - product) != 0 )
			dissimilarity = product / (HAMACHER_P + (1 - HAMACHER_P) * (suma - product));

		return commonality - dissimilarity + winklerImprovement;
	}

	/**
	 * Lower-cases the string independently of the default locale and drops
	 * the separator characters '.', '_' and ' '.
	 */
	private static String normalize( String raw ){
		String lowered = raw.toLowerCase( java.util.Locale.ROOT );
		StringBuilder kept = new StringBuilder( lowered.length() );
		for( int i = 0 ; i < lowered.length() ; i++ ){
			char c = lowered.charAt( i );
			if( c != '.' && c != '_' && c != ' ' )
				kept.append( c );
		}
		return kept.toString();
	}

	/**
	 * Finds the longest common substring of two strings.
	 *
	 * Every start position in s2 is tried for every start position in s1, so
	 * overlapping candidates are not skipped. On ties the match with the
	 * smallest start in s1, then the smallest start in s2, wins.
	 *
	 * @return { start in s1, start in s2, length }; length is 0 when the
	 *         strings share no character
	 */
	private static int[] longestCommonSubstring( String s1 , String s2 ){
		int len1 = s1.length();
		int len2 = s2.length();
		int best = 0;
		int bestStart1 = 0;
		int bestStart2 = 0;

		// A start position is only worth trying while more than 'best'
		// characters remain after it.
		for( int i = 0 ; len1 - i > best ; i++ ){
			for( int j = 0 ; len2 - j > best ; j++ ){
				if( s1.charAt( i ) != s2.charAt( j ) )
					continue;
				int run = 1;
				while( (i + run < len1) && (j + run < len2)
						&& (s1.charAt( i + run ) == s2.charAt( j + run )) )
					run++;
				if( run > best ){
					best = run;
					bestStart1 = i;
					bestStart2 = j;
				}
			}
		}
		return new int[] { bestStart1 , bestStart2 , best };
	}

	/** Returns text without the length characters starting at index start. */
	private static String removeRange( String text , int start , int length ){
		return text.substring( 0 , start ) + text.substring( start + length );
	}

	/**
	 * Winkler-style bonus: 0.1 per shared leading character (at most 4),
	 * scaled by how far the commonality is from 1.
	 */
	private static double winklerImprovement( String s1 , String s2 , double commonality ){

		int limit = Math.min( s1.length() , s2.length() );
		int prefix = 0;
		while( prefix < limit && s1.charAt( prefix ) == s2.charAt( prefix ) )
			prefix++;

		double commonPrefixLength = Math.min( MAX_WINKLER_PREFIX , prefix );
		return commonPrefixLength * WINKLER_SCALE * (1 - commonality);
	}

	/**
	 * Returns a copy of str with every occurrence of remo removed.
	 *
	 * @param str  the string to filter, must not be null
	 * @param remo the character to remove
	 * @return str without any remo characters
	 */
	public static String normalizeString( String str , char remo ){

		StringBuilder kept = new StringBuilder( str.length() );
		for( int i = 0 ; i < str.length() ; i++ ){
			char c = str.charAt( i );
			if( c != remo )
				kept.append( c );
		}
		return kept.toString();
	}

	/** Prints the I-Sub score of a few sample string pairs. */
	public static void main(String[] args) {
		System.out.println("Test de la mesure ISub");

		String[][] allTest = new String[][] {
				{ "store", "spore"},
				{ "numPages", "numberOfPages"},
				{ "DosageDuFacteurV", "MesureDuFacteurV"},
				{ "DosageDuFacteurV", "DosageDuFacteurX"},
				{ "DosageDuFacteurV", "MesureDuFacteurX"},
				{ "SyndromeDeKawasaki", "MaladieDeKawasaki"}
		};

		for(String[] oneTest : allTest) {
			System.out.println("\nTest: "+oneTest[0]+ " VS "+oneTest[1]);
			double score = I_Sub.score(oneTest[0], oneTest[1]);
			System.out.println("\t->Resultat: "+score);
		}

	}
}