1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190
|
package net.sourceforge.segment.srx;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
/**
* Represents merged splitting pattern.
* Responsible for merging breaking rules into one large pattern and
* creating non breaking rules pattern.
* @author loomchild
*/
public class MergedPattern {
private Pattern breakingPattern;
private List<Pattern> nonBreakingPatternList;
public MergedPattern(List<LanguageRule> languageRuleList) {
this.nonBreakingPatternList = new ArrayList<Pattern>();
StringBuilder breakingPatternBuilder = new StringBuilder();
List<Rule> ruleList = extractRules(languageRuleList);
List<List<Rule>> ruleGroupList = groupRules(ruleList);
if (ruleGroupList.size() > 0) {
for (List<Rule> ruleGroup : ruleGroupList) {
if (breakingPatternBuilder.length() > 0) {
breakingPatternBuilder.append('|');
}
// All breaking rules need to be merged because segmentation
// need to be done in one pass when text is read from Reader.
// Breaking rule need to be inside capturing group so
// it is possible to recognize which breaking rule has been
// applied during the splitting and know which non-breaking
// rules to use.
// Breaking rule cannot contain capturing groups.
// Capturing groups are replaced with non-capturing groups
// inside create breaking pattern function.
String breakingGroupPattern = createBreakingPattern(ruleGroup);
breakingPatternBuilder.append("(" + breakingGroupPattern + ")");
// If first rule in the group is breaking then there are
// no non-breaking rules in the group. In this case null
// is appended to non breaking pattern list, because
// null does not match anything.
if (!ruleGroup.get(0).isBreaking()) {
Pattern nonBreakingGroupPattern =
Pattern.compile(createNonBreakingPattern(ruleGroup));
nonBreakingPatternList.add(nonBreakingGroupPattern);
} else {
nonBreakingPatternList.add(null);
}
}
this.breakingPattern = Pattern.compile(breakingPatternBuilder
.toString());
} else {
// null means that that pattern will not match anything
// (as empty pattern matches everything).
this.breakingPattern = null;
}
}
public Pattern getBreakingPattern() {
return breakingPattern;
}
public List<Pattern> getNonBreakingPatternList() {
return nonBreakingPatternList;
}
/**
* @param languageRuleList
* Language rule list.
* @return Returns merged list of rules form given language rules.
*/
private List<Rule> extractRules(List<LanguageRule> languageRuleList) {
List<Rule> ruleList = new ArrayList<Rule>();
for (LanguageRule languageRule : languageRuleList) {
ruleList.addAll(languageRule.getRuleList());
}
return ruleList;
}
/**
* Divides rules to groups where breaking and non-breaking rules cannot be
* interlaced.
*
* @param ruleList
* @return
*/
private List<List<Rule>> groupRules(List<Rule> ruleList) {
List<List<Rule>> ruleGroupList = new ArrayList<List<Rule>>();
List<Rule> ruleGroup = new ArrayList<Rule>();
boolean previousBreaking = false;
for (Rule rule : ruleList) {
if (rule.isBreaking() && !previousBreaking) {
ruleGroupList.add(ruleGroup);
} else if (!rule.isBreaking() && previousBreaking) {
ruleGroup = new ArrayList<Rule>();
}
ruleGroup.add(rule);
previousBreaking = rule.isBreaking();
}
return ruleGroupList;
}
/**
* Merges all breaking rules on list into one pattern.
*
* @param ruleList
* List of rules
* @return Returns pattern.
*/
private String createBreakingPattern(List<Rule> ruleList) {
StringBuilder patternBuilder = new StringBuilder();
for (Rule rule : ruleList) {
if (rule.isBreaking()) {
if (patternBuilder.length() > 0) {
patternBuilder.append('|');
}
// Capturing groups need to be removed from patterns as
// they will interfere with capturing group order
// which is used to recognize which breaking rule has been
// applied and decide which non-breaking rules to use.
// In addition as Java does not allow infinite length patterns
// in lookbehind, before pattern need to be shortened.
String beforePattern = Util.finitize(
Util.removeCapturingGroups(rule.getBeforePattern()));
String afterPattern = Util.removeCapturingGroups(rule
.getAfterPattern());
// Need to use lookbehind also in breaking rule because
// this way they become zero-length matches and I want
// to match shorter rules first, independent of occurrence order
// as in normal alternative.
// Example:
// Input: "aaa".
// Pattern "aa|a" matches "aa" first.
// Pattern "(?<=aa)|(?<=a)" matches "a" first.
if (beforePattern.length() > 0) {
patternBuilder.append("(?<=" + beforePattern + ")");
}
if (afterPattern.length() > 0) {
patternBuilder.append("(?=" + afterPattern + ")");
}
}
}
return patternBuilder.toString();
}
/**
* Creates non breaking pattern by merging given rules.
*
* @param ruleList
* Rule list.
* @return Non breaking pattern.
*/
private String createNonBreakingPattern(List<Rule> ruleList) {
StringBuilder patternBuilder = new StringBuilder();
for (Rule rule : ruleList) {
if (!rule.isBreaking()) {
if (patternBuilder.length() > 0) {
patternBuilder.append('|');
}
// As Java does not allow infinite length patterns
// in lookbehind, before pattern need to be shortened.
String beforePattern = Util.finitize(rule.getBeforePattern());
String afterPattern = rule.getAfterPattern();
if (beforePattern.length() > 0) {
patternBuilder.append("(?<=" + beforePattern + ")");
}
if (afterPattern.length() > 0) {
patternBuilder.append("(?=" + afterPattern + ")");
}
}
}
return patternBuilder.toString();
}
}
|