1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111
|
package com.wcohen.ss;
import com.wcohen.ss.api.*;
import java.io.*;
import org.apache.log4j.Logger;
/**
* Needleman-Wunsch string distance, following Durban et al.
* Sec 2.3, but using an approximate string distance.
*/
public class ApproxNeedlemanWunsch extends AbstractStringDistance
{
private static Logger log=Logger.getLogger(ApproxNeedlemanWunsch.class);
private static final int DEFAULT_WIDTH = 40;
private CharMatchScore charMatchScore;
private double gapCost;
private MyMatrix mat;
private int width = DEFAULT_WIDTH;
public ApproxNeedlemanWunsch() { this(CharMatchScore.DIST_01, 1.0 ); }
public ApproxNeedlemanWunsch(CharMatchScore charMatchScore,double gapCost) {
this.charMatchScore = charMatchScore;
this.gapCost = gapCost;
}
public void setWidth(int w) { this.width=w; }
public double score(StringWrapper s,StringWrapper t) {
mat = new MyMatrix( s, t );
// fill matrix forward to prevent deep recursion
for (int i=1; i<=s.length(); i++) {
int j = (int)Math.round(i * mat.getScale());
if (j>=1 && j<=t.length()) {
double forceComputatationHere = mat.get( i, j);
}
}
return mat.get(s.length(), t.length() );
}
public String explainScore(StringWrapper s,StringWrapper t) {
mat = new MyMatrix( s, t );
double d = mat.get(s.length(), t.length() );
mat.setPrintNegativeValues(true);
return mat.toString() + "\nScore = "+d;
}
/** Find a character in the first string, s, that can be aligned
* with the i-th character in the second string, t. */
public int getAlignedChar(int iMinusOne,boolean preferHigherIndices)
{
// internally to this package, strings are indexed 1...N, so
// we need to convert from the usual 0...N-1 Java convention
int i = iMinusOne+1;
int bestJ = -1;
double bestScore = -Double.MAX_VALUE;
for (int j=mat.getFirstStoredEntryInRow(i); j<=mat.getLastStoredEntryInRow(i); j++) {
if (mat.outOfRange(i,j)) log.error("out of range: "+i+","+j);
double score = mat.get(i,j);
if ((score>bestScore) || (score==bestScore && preferHigherIndices)) {
bestScore = score; bestJ = j;
}
}
// convert back to the usual 0...N-1 Java convention
return bestJ-1;
}
private class MyMatrix extends ApproxMemoMatrix
{
public MyMatrix(StringWrapper s,StringWrapper t) {
super(s,t,width,-Double.MAX_VALUE);
}
public double compute(int i,int j) {
if (i==0) return -j*gapCost;
if (j==0) return -i*gapCost;
return max3( get(i-1,j-1) + charMatchScore.matchScore( sAt(i), tAt(j) ),
get(i-1, j) - gapCost,
get(i, j-1) - gapCost);
}
}
static public void main(String[] argv) throws Exception
{
if (argv.length==3) {
// -f file1 file2
String s = readFile(new File(argv[1]));
String t = readFile(new File(argv[2]));
long t0 = System.currentTimeMillis();
double score = new ApproxNeedlemanWunsch().score(s,t);
long tf = System.currentTimeMillis();
System.out.println("score = "+score+" runtime = "+((tf-t0)/1000.0)+" sec");
} else {
doMain(new ApproxNeedlemanWunsch(), argv);
}
}
private static String readFile(File in) throws IOException
{
InputStream inputStream = new FileInputStream(in);
byte[] bytes = new byte[inputStream.available()];
inputStream.read(bytes);
inputStream.close();
return new String(bytes);
}
}
|