package com.wcohen.ss.expt; import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import com.wcohen.ss.abbvGapsHmm.Acronym; import com.wcohen.ss.abbvGapsHmm.AlignmentPredictionModel; /** * Extracts abbreviation pairs (<short-form, long-form>) from text using an 'abbreviation distance metric' which evaluates * the probability of a short-form string being an abbreviation/acronym of another long-form string. * The probability is given by an HMM-based alignment between the two strings. *

* Sample command line:
* java com.wcohen.ss.expt.ExtractAbbreviations ./train/abbvAlign_corpus.txt experiment_name *

* Citation: Dana Movshovitz-Attias and William Cohen, Alignment-HMM-based Extraction of Abbreviations from Biomedical Text, 2012, BioNLP in NAACL * * @see com.wcohen.ss.AbbreviationAlignment * @author Dana Movshovitz-Attias * */ public class ExtractAbbreviations { public class Stats { public int FN, FP, TP, TN; public float precision, recall, F1; public Stats(){ FN = 0; FP = 0; TN = 0; FP = 0; precision = 0f; recall = 0f; F1 = 0f; } } public static String SEPARATOR = "#_#"; private String _input; private String _output; private String _gold; private String _train = "./train"; private AlignmentPredictionModel _alignPredictor = null; private Map _strToID = null; private Map> _idToStr = null; private Map _strToSrc = null; public ExtractAbbreviations(String input, String output, String train, String gold) { _input = input; _output = output; _train = train; _gold = gold; } public void run() throws IOException { loadPredictor(); setTrainDir(_train); predictAndTest(AlignmentPredictionModel.loadTrainingCorpus(_input), AlignmentPredictionModel.loadLabels(_gold)); } protected void mkdir(String dir) { File f = new File(dir); f.mkdirs(); } protected void setTrainDir(String trainDir) { _alignPredictor.setTrainingDataDir(trainDir+"/"); _alignPredictor.setModelParamsFile(trainDir+"/hmmModelParams.txt"); _alignPredictor.trainIfNeeded(); } protected AlignmentPredictionModel loadPredictor(){ if(_alignPredictor == null){ try { _alignPredictor = new AlignmentPredictionModel(); } catch (IOException e) { System.err.println("Unable to load AlignmentPredictionModel"); e.printStackTrace(); System.exit(1); } } return _alignPredictor; } protected void predictAndTest(List corpus, List> trueLabels) throws IOException{ Stats totalStats = new Stats(); String output_abbvs = "./"+_output+"_abbvs"; String output_strings = "./"+_output+"_strings"; BufferedWriter bw_abbvs = new BufferedWriter(new FileWriter(output_abbvs)); BufferedWriter bw_strings = new BufferedWriter(new FileWriter(output_strings)); _strToID = new HashMap(); _idToStr = new HashMap>(); _strToSrc = new HashMap(); // iterate over all documents in the corpus for(int docID = 0; docID < corpus.size(); ++docID){ Stats currStats = predictAndTest(docID, corpus, trueLabels, bw_abbvs); if(trueLabels!= null){ totalStats.TP += currStats.TP; totalStats.FP += currStats.FP; totalStats.FN += currStats.FN; totalStats.precision += currStats.precision; totalStats.recall += currStats.recall; totalStats.F1 += currStats.F1; } } outputPairs(bw_strings); bw_abbvs.close(); bw_strings.close(); if(trueLabels!= null){ System.out.println("Avg TP: "+(totalStats.TP / (double)corpus.size())); System.out.println("Avg FP: "+(totalStats.FP / (double)corpus.size())); System.out.println("Avg Precision: "+(totalStats.precision / (double)corpus.size())); System.out.println("Avg Recall: "+(totalStats.recall / (double)corpus.size())); System.out.println("Avg F1: "+(totalStats.F1 / (double)corpus.size())); float tot_precision, tot_recall, tot_F1; if(totalStats.TP+totalStats.FP == 0){ tot_precision = 1f; } else{ tot_precision = new Float(totalStats.TP) / new Float(totalStats.TP+totalStats.FP); } tot_recall = totalStats.TP / new Float(totalStats.TP+totalStats.FN); tot_F1 = 2* ((tot_precision*tot_recall) / (tot_precision+tot_recall)); System.out.println("Total Precision: "+(tot_precision / (double)corpus.size())); System.out.println("Total Recall: "+(tot_recall / (double)corpus.size())); System.out.println("Total F1: "+(tot_F1 / (double)corpus.size())); } } protected String outputAbbvs(Map predictions) { String out = ""; for (String sf : predictions.keySet()) { String lf = predictions.get(sf)._longForm; out += sf + "\t" + lf + "#_#"; } return out; } protected void addAbbreviationPairs(Map predictions) { for (String sf : predictions.keySet()) { String lf = predictions.get(sf)._longForm; Integer sf_id = _strToID.get(sf); Integer lf_id = _strToID.get(lf); if (sf_id == null && lf_id == null){ Integer id = _strToID.size(); _strToID.put(sf, id); _strToID.put(lf, id); _idToStr.put(id, new HashSet()); _idToStr.get(id).add(sf); _idToStr.get(id).add(lf); } else if (sf_id == null && lf_id != null) { _strToID.put(sf, lf_id); _idToStr.get(lf_id).add(sf); } else if (lf_id == null && sf_id != null) { _strToID.put(lf, sf_id); _idToStr.get(sf_id).add(lf); } else if (sf_id != lf_id) { _strToID.put(lf, sf_id); for (String str : _idToStr.get(lf_id)) { _strToID.put(str, sf_id); _idToStr.get(sf_id).add(str); } _idToStr.remove(lf_id); } _strToSrc.put(sf, "short"); _strToSrc.put(lf, "long"); } } protected void outputPairs(BufferedWriter bw) throws IOException { Integer ids[] = _idToStr.keySet().toArray(new Integer[0]); for (int newId = 0; newId < ids.length; newId++) { int oldId = ids[newId]; for (String str : _idToStr.get(oldId)) { bw.write(_strToSrc.get(str) + "\t" + newId + "\t" + str + "\n"); } } } protected Stats predictAndTest(int docID, List corpus, List> trueLabels, BufferedWriter bw_abbvs) throws IOException { // predict String text = corpus.get(docID); Collection all_predictions = _alignPredictor.predict(text); Map final_predictions = _alignPredictor.acronymsArrayToMap(all_predictions); bw_abbvs.write(outputAbbvs(final_predictions)+"\n"); addAbbreviationPairs(final_predictions); // test if(trueLabels != null){ Map docTrueLabels = trueLabels.get(docID); Stats stats = new Stats(); stats.FN = docTrueLabels.size(); stats.TP = 0; stats.FP = 0; for (String shortFort : final_predictions.keySet()) { String predictedLongForm = final_predictions.get(shortFort)._longForm; if(predictedLongForm == null){ stats.FP++; } else{ String trueLongForm = docTrueLabels.get(shortFort); if(predictedLongForm.toLowerCase().equals(trueLongForm.toLowerCase())){ stats.FP++; } else{ stats.TP++; stats.FN--; } } } if(stats.TP+stats.FP == 0){ stats.precision = 1f; } else{ stats.precision = new Float(stats.TP) / new Float(stats.TP+stats.FP); } stats.recall = stats.TP / new Float(stats.TP+stats.FN); stats.F1 = 2 * ((stats.precision*stats.recall) / (stats.precision+stats.recall)); return stats; } return null; } /** * Extracts abbreviation pairs from text.

* Usage: ExtractAbbreviations input experiment_name [gold-file] [train-dir] */ public static void main(String[] args) { if(args.length < 2){ System.out.println("Usage: ExtractAbbreviations input experiment_name [gold-file] [train-dir] \n\n"+ "input - Corpus file (one line per file) from which abbreviations will be extracted.\n"+ "experiment_name - The experiment name will be used to create these output files:\n"+ " './_abbvs' - contains the abbreviations extracted from the corpus, in a format similar to './train/abbvAlign_pairs.txt', "+ "the abbreviations from each document are concatenated to one line.\n"+ " './_strings' - contains pairs of short and long forms of abbreviations extracted from the corpus, "+ "in a format that can be used for a matching experiment (using MatchExpt, AbbreviationsBlocker, and AbbreviationAlignment distance)."+ "train - Optional. Directory containing a corpus file named 'abbvAlign_corpus.txt' for training the abbreviation HMM. "+ "Corpus format is one line per file.\n"+ " The model parameters will be saved in this directory under 'hmmModelParams.txt' so the HMM will only have to be trained once.\n"+ " Default = './train/'\n"+ "gold - Optional. If available, the gold data will be used to estimate the performance of the HMM on the input corpus.\n"+ " './train/abbvAlign_pairs.txt' is a sample gold file for the 'train/abbvAlign_corpus.txt corpus.'\n"+ " Default = by default, no gold data is given and no estimation is done." ); System.exit(1); } String input = args[0]; String output = args[1]; String gold = null; if(args.length > 2) gold = args[2]; String train = "./train"; if(args.length > 3) train = args[3]; ExtractAbbreviations tester = new ExtractAbbreviations(input, output, train, gold); try { tester.run(); } catch (IOException e) { e.printStackTrace(); } } }