1、使用Regex实现高亮关键词
1.1 实现
/**
 * Regex-based keyword highlighter: wraps every occurrence of each keyword in a red
 * {@code <span>}.
 *
 * <p>Keywords are applied one at a time, longest first, and each pass runs over the output of
 * the previous pass. As a consequence a shorter keyword can match inside a span produced for a
 * longer keyword (producing nested spans), and a keyword can also match text that belongs to the
 * inserted markup itself.
 */
public class KeywordHighlighter {
    /** Markup inserted before each matched keyword. */
    private static final String HIGHLIGHTER_PREFIX = "<span style=\"color:red\">";

    /** Markup inserted after each matched keyword. */
    private static final String HIGHLIGHTER_SUFFIX = "</span>";

    /**
     * Highlights all case-insensitive occurrences of the given keywords in {@code text}.
     *
     * @param text     the text to highlight; when blank it is returned unchanged
     * @param keywords the keywords to look for; the list is not modified, and {@code null} or
     *                 empty entries are ignored
     * @return the highlighted text together with the keywords (as given in {@code keywords})
     *         that matched at least once
     */
    public static HighlighterResult highlightByRegex(String text, List<String> keywords) {
        if (StringUtils.isBlank(text) || CollectionUtils.isEmpty(keywords)) {
            return new HighlighterResult(text, new HashSet<>(0));
        }

        // Work on a private copy: sorting the caller's list in place would reorder it behind the
        // caller's back and fails outright for unmodifiable lists.
        List<String> ordered = new java.util.ArrayList<>(keywords.size());
        for (String keyword : keywords) {
            // An empty pattern matches at every position and would litter the text with empty
            // spans; a null keyword cannot be compiled at all.
            if (keyword != null && !keyword.isEmpty()) {
                ordered.add(keyword);
            }
        }
        // Longest keywords first.
        ordered.sort((left, right) -> Integer.compare(right.length(), left.length()));

        Set<String> matchedKeywords = new HashSet<>();
        String current = text;
        for (String keyword : ordered) {
            Pattern pattern = Pattern.compile(Pattern.quote(keyword), Pattern.CASE_INSENSITIVE);
            Matcher matcher = pattern.matcher(current);
            // StringBuffer (not StringBuilder) keeps this compatible with Java 8's Matcher API.
            StringBuffer rewritten = new StringBuffer();
            while (matcher.find()) {
                matchedKeywords.add(keyword);
                // The matched text is literal: '$' and '\' must not be interpreted as group
                // references or escapes by appendReplacement.
                String replacement = HIGHLIGHTER_PREFIX + matcher.group() + HIGHLIGHTER_SUFFIX;
                matcher.appendReplacement(rewritten, Matcher.quoteReplacement(replacement));
            }
            matcher.appendTail(rewritten);
            // The next keyword is searched in the already-highlighted text.
            current = rewritten.toString();
        }
        return new HighlighterResult(current, matchedKeywords);
    }
}
1.2 测试
/**
 * Runs the regex-based highlighter over a sample sentence and prints the highlighted text
 * followed by the set of matched keywords.
 */
@Test
public void highlightTest() {
    String text = "有人担忧午睡后下午会感到更加困倦,影响工作效率;也有人认为是毫无意义的时间消耗。";

    // Mix of overlapping keywords ("午睡" / "午睡后") and keywords that do not occur in the text.
    String[] samples = {"午睡", "午睡后", "工作秩序", "时间", "意思", "认真"};
    List<String> keywords = new ArrayList<>();
    for (String sample : samples) {
        keywords.add(sample);
    }

    HighlighterResult highlightResult =
            com.ebs.framework.common.TrieTool.KeywordHighlighter.highlightByRegex(text, keywords);

    System.out.println(highlightResult.getHighlighterText());
    System.out.println(highlightResult.getMatchedKeywords());
}
结果:"午睡后"被匹配并高亮之后,其内部的"午睡"又被再次匹配,导致高亮标签出现嵌套。
有人担忧<span style="color:red"><span style="color:red">午睡</span>后</span>下午会感到更加困倦,影响工作效率;也有人认为是毫无意义的<span style="color:red">时间</span>消耗。
[午睡后, 时间, 午睡]
2、使用Trie树高亮关键词
2.1 TrieNode
/**
 * A single node of a character trie.
 */
public class TrieNode {
    /** Child nodes, keyed by the character that leads to each of them. */
    Map<Character, TrieNode> children = new HashMap<>();

    /** Flag marking this node as the last character of a stored word. */
    boolean isEntOfWord = false;

    /** Creates a node with no children that is not marked as the end of a word. */
    public TrieNode() {
    }
}
2.2 Trie
/**
 * Character trie that stores a set of words, one {@link TrieNode} per character.
 */
public class Trie {
    /** Entry node; it corresponds to the empty prefix. */
    private final TrieNode root = new TrieNode();

    /** Creates an empty trie. */
    public Trie() {
    }

    /**
     * Adds a word to the trie, creating nodes for characters that are not present yet and
     * flagging the node of the last character as the end of a word.
     *
     * @param word the word to store
     */
    public void insert(String word) {
        TrieNode current = root;
        for (int index = 0; index < word.length(); index++) {
            current = current.children.computeIfAbsent(word.charAt(index), key -> new TrieNode());
        }
        current.isEntOfWord = true;
    }

    /**
     * @return the root node of this trie
     */
    public TrieNode getRoot() {
        return root;
    }
}
2.3 HighlighterResult
返回对象,包含高亮的文本以及具体匹配到的关键词
/**
 * Result of a highlighting run: the text with highlight markup plus the keywords that matched.
 */
public class HighlighterResult {

    /** Text after highlighting has been applied. */
    private String highlighterText;

    /** Keywords that were found in the text. */
    private Set<String> matchedKeywords;

    /**
     * Creates a result holding the given text and keyword set as-is (no copy is made).
     *
     * @param text     the highlighted text
     * @param keywords the keywords that matched
     */
    public HighlighterResult(String text, Set<String> keywords) {
        this.highlighterText = text;
        this.matchedKeywords = keywords;
    }

    /**
     * @return the highlighted text
     */
    public String getHighlighterText() {
        return this.highlighterText;
    }

    /**
     * @return the keywords that matched
     */
    public Set<String> getMatchedKeywords() {
        return this.matchedKeywords;
    }
}
2.4 高亮
/**
 * Trie-based keyword highlighter.
 *
 * <p>The text is scanned from left to right. At each position the longest keyword starting there
 * is taken, its characters are marked, and scanning resumes right after it, so a keyword that is
 * a prefix of a longer matching keyword is not reported separately. Matching is case-sensitive.
 * Marked characters that are directly adjacent are wrapped in one shared {@code <span>}.
 */
public class KeywordHighlighter {
    /** Markup inserted before each highlighted run of characters. */
    private static final String HIGHLIGHTER_PREFIX = "<span style=\"color:red\">";

    /** Markup inserted after each highlighted run of characters. */
    private static final String HIGHLIGHTER_SUFFIX = "</span>";

    /** Trie holding the keywords to look for. */
    private final Trie trie;

    /**
     * Builds a highlighter for the given keywords.
     *
     * @param keywords the keywords to highlight; a {@code null} list is treated as empty, and
     *                 {@code null} or empty entries are skipped because they can never match
     */
    public KeywordHighlighter(List<String> keywords) {
        trie = new Trie();
        if (keywords == null) {
            return;
        }
        for (String keyword : keywords) {
            if (keyword != null && !keyword.isEmpty()) {
                trie.insert(keyword);
            }
        }
    }

    /**
     * Builds a highlighter on top of an existing trie.
     *
     * @param tire the trie containing the keywords to highlight
     */
    public KeywordHighlighter(Trie tire) {
        this.trie = tire;
    }

    /**
     * Highlights every keyword occurrence in {@code text}.
     *
     * @param text the text to scan; {@code null} or empty text is returned unchanged with an
     *             empty keyword set
     * @return the text with highlight markup and the set of keywords that matched
     */
    public HighlighterResult highlight(String text) {
        Set<String> matchedKeywords = new HashSet<>();
        if (text == null || text.isEmpty()) {
            return new HighlighterResult(text, matchedKeywords);
        }

        TrieNode root = trie.getRoot();
        int n = text.length();
        // marked[k] is true when character k belongs to a matched keyword.
        boolean[] marked = new boolean[n];

        int i = 0;
        while (i < n) {
            // Walk the trie as far as the text allows, remembering the last position at which
            // a complete keyword ended (i.e. the longest keyword starting at i).
            TrieNode node = root;
            int lastMatchPos = -1;
            for (int j = i; j < n; j++) {
                node = node.children.get(text.charAt(j));
                if (node == null) {
                    break;
                }
                if (node.isEntOfWord) {
                    lastMatchPos = j;
                }
            }

            if (lastMatchPos == -1) {
                i++;
                continue;
            }

            matchedKeywords.add(text.substring(i, lastMatchPos + 1));
            for (int k = i; k <= lastMatchPos; k++) {
                marked[k] = true;
            }
            // Resume after the matched keyword so matches never overlap.
            i = lastMatchPos + 1;
        }
        return renderHighlights(text, marked, matchedKeywords);
    }

    /**
     * Builds the highlighted text from the per-character marks.
     *
     * <p>NOTE(review): the characters of {@code text} are copied verbatim, without HTML
     * escaping; confirm that callers pass text that is safe to embed in markup.
     *
     * @param text            the original text
     * @param marked          one flag per character of {@code text}; {@code true} means highlight
     * @param matchedKeywords the keywords that matched, passed through to the result
     * @return the highlighted text together with {@code matchedKeywords}
     */
    private HighlighterResult renderHighlights(String text, boolean[] marked, Set<String> matchedKeywords) {
        StringBuilder sb = new StringBuilder(text.length());
        boolean inHighlight = false;
        for (int i = 0; i < text.length(); i++) {
            // Emit markup only where the marked state changes.
            if (marked[i] != inHighlight) {
                sb.append(marked[i] ? HIGHLIGHTER_PREFIX : HIGHLIGHTER_SUFFIX);
                inHighlight = marked[i];
            }
            sb.append(text.charAt(i));
        }
        // Close a span that runs to the end of the text.
        if (inHighlight) {
            sb.append(HIGHLIGHTER_SUFFIX);
        }
        return new HighlighterResult(sb.toString(), matchedKeywords);
    }
}
2.5 测试
/**
 * Runs the trie-based highlighter over a sample sentence and prints the highlighted text
 * followed by the set of matched keywords.
 */
@Test
public void tireHighlightTest() {
    String text = "有人担忧午睡后下午会感到更加困倦,影响工作效率;也有人认为是毫无意义的时间消耗。";

    // Mix of overlapping keywords ("午睡" / "午睡后") and keywords that do not occur in the text.
    String[] samples = {"午睡", "午睡后", "工作秩序", "时间", "意思", "认真"};
    List<String> keywords = new ArrayList<>();
    for (String sample : samples) {
        keywords.add(sample);
    }

    com.ebs.framework.common.TrieTool.KeywordHighlighter highlighter =
            new com.ebs.framework.common.TrieTool.KeywordHighlighter(keywords);
    HighlighterResult result = highlighter.highlight(text);

    System.out.println(result.getHighlighterText());
    System.out.println(result.getMatchedKeywords());
}
// 有人担忧<span style="color:red">午睡后</span>下午会感到更加困倦,影响工作效率;也有人认为是毫无意义的<span style="color:red">时间</span>消耗。
// [午睡后, 时间]