同义词功能在全文搜索时的意义,大家应该都懂的。今天中文我就试着写了一个同义词分词的示例demo,其实主要代码还是参考Lucene in Action这本英文版书籍的随书代码,只不过Lucene in Action书里的示例代码目前最新版只支持到Lucene4.x,对于Lucene5.x,代码需要稍作修改,下面是基于Lucene5.x的自定义同义词分词器demo:
- package com.yida.framework.lucene5.analyzer.synonym;
- import java.io.IOException;
- /**
- * Synonym lookup engine: maps a single term to the terms that should be
- * treated as its synonyms.
- * @author Lanxiaowei
- *
- */
- public interface SynonymEngine {
- /**
- * Looks up the synonyms of the given term.
- * @param s the term to look up
- * @return the synonyms of {@code s}. NOTE(review): this interface does not say how
- * "no synonyms" is reported; presumably {@code null} is allowed — confirm with implementers and callers
- * @throws IOException if the lookup fails
- */
- String[] getSynonyms(String s) throws IOException;
- }
- package com.yida.framework.lucene5.analyzer.synonym;
- import java.io.IOException;
- import java.util.HashMap;
- /**
-  * In-memory {@link SynonymEngine} backed by a small, fixed table of English synonyms.
-  */
- public class BaseSynonymEngine implements SynonymEngine {
-     /** Term to synonyms table, shared by all instances and filled once when the class is loaded. */
-     private static final HashMap<String, String[]> map = new HashMap<String, String[]>();
-     // Static initializer (not an instance initializer): the table is static, so filling it
-     // per instance would rewrite shared state on every construction without synchronization.
-     static {
-         map.put("quick", new String[] {"fast", "speedy"});
-         map.put("jumps", new String[] {"leaps", "hops"});
-         map.put("over", new String[] {"above"});
-         map.put("lazy", new String[] {"apathetic", "sluggish"});
-         map.put("dog", new String[] {"canine", "pooch"});
-     }
-     /**
-      * Looks up the synonyms of a term in the built-in table.
-      *
-      * @param s the term to look up; the table keys are lower-case, and matching is exact
-      * @return the synonyms registered for {@code s}, or {@code null} when the table has no entry
-      * @throws IOException never thrown by this implementation; declared by the interface
-      */
-     @Override
-     public String[] getSynonyms(String s) throws IOException {
-         return map.get(s);
-     }
- }
- package com.yida.framework.lucene5.analyzer.synonym;
- import java.io.IOException;
- import java.util.Stack;
- import org.apache.lucene.analysis.TokenFilter;
- import org.apache.lucene.analysis.TokenStream;
- import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
- import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
- import org.apache.lucene.util.AttributeSource;
- /**
-  * Custom synonym token filter.
-  * <p>
-  * Every token from the wrapped stream is passed through unchanged. When the
-  * {@link SynonymEngine} knows synonyms for that token, they are emitted right after it
-  * with a position increment of 0, so they occupy the same position as the original token.
-  *
-  * @author Lanxiaowei
-  */
- public class SynonymFilter extends TokenFilter {
-     public static final String TOKEN_TYPE_SYNONYM = "SYNONYM";
-     /** Synonyms of the last original token that have not been emitted yet. */
-     private final Stack<String> synonymStack;
-     private final SynonymEngine engine;
-     /** Attribute snapshot of the last original token that has synonyms; restored for each synonym. */
-     private AttributeSource.State current;
-     private final CharTermAttribute termAtt;
-     private final PositionIncrementAttribute posIncrAtt;
-     /**
-      * @param in     the token stream to decorate
-      * @param engine the source of synonyms, queried once per incoming token
-      */
-     public SynonymFilter(TokenStream in, SynonymEngine engine) {
-         super(in);
-         this.synonymStack = new Stack<String>();
-         this.engine = engine;
-         this.termAtt = addAttribute(CharTermAttribute.class);
-         this.posIncrAtt = addAttribute(PositionIncrementAttribute.class);
-     }
-     /**
-      * Emits the next buffered synonym if there is one, otherwise advances the wrapped stream.
-      * <p>
-      * Declared {@code final} because Lucene requires non-final TokenStream subclasses to have
-      * a final {@code incrementToken()}; the requirement is checked when assertions are enabled.
-      *
-      * @return {@code true} if a token (original or synonym) is available, {@code false} at end of stream
-      * @throws IOException if the wrapped stream or the synonym engine fails
-      */
-     @Override
-     public final boolean incrementToken() throws IOException {
-         if (!synonymStack.isEmpty()) {
-             // Pop a buffered synonym and replay the original token's attributes
-             // (offsets, type, ...) before overwriting the term text.
-             String syn = synonymStack.pop();
-             restoreState(current);
-             // Lucene 5.x replacement for the older termAtt.setTermBuffer(syn) call.
-             termAtt.setEmpty().append(syn);
-             // Position increment 0 stacks the synonym on the original token's position.
-             posIncrAtt.setPositionIncrement(0);
-             return true;
-         }
-         // Read the next original token.
-         if (!input.incrementToken()) {
-             return false;
-         }
-         // Buffer its synonyms and remember its attributes so they can be replayed later.
-         if (addAliasesToStack()) {
-             current = captureState();
-         }
-         // Return the original token itself.
-         return true;
-     }
-     /**
-      * Clears buffered synonyms and the captured state so that a reused filter does not
-      * emit leftovers from a previous, possibly abandoned, pass.
-      *
-      * @throws IOException if resetting the wrapped stream fails
-      */
-     @Override
-     public void reset() throws IOException {
-         super.reset();
-         synonymStack.clear();
-         current = null;
-     }
-     /**
-      * Looks up the synonyms of the current term and pushes them onto the stack.
-      *
-      * @return {@code true} if at least one synonym was buffered
-      * @throws IOException if the synonym engine fails
-      */
-     private boolean addAliasesToStack() throws IOException {
-         // Lucene 5.x replacement for the older termAtt.term() call.
-         String[] synonyms = engine.getSynonyms(termAtt.toString());
-         if (synonyms == null || synonyms.length == 0) {
-             return false;
-         }
-         for (String synonym : synonyms) {
-             synonymStack.push(synonym);
-         }
-         return true;
-     }
- }
- package com.yida.framework.lucene5.analyzer.synonym;
- import java.io.BufferedReader;
- import java.io.Reader;
- import java.io.StringReader;
- import org.apache.lucene.analysis.Analyzer;
- import org.apache.lucene.analysis.TokenStream;
- import org.apache.lucene.analysis.Tokenizer;
- import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
- import org.apache.lucene.analysis.core.LetterTokenizer;
- import org.apache.lucene.analysis.core.LowerCaseFilter;
- import org.apache.lucene.analysis.core.StopAnalyzer;
- import org.apache.lucene.analysis.core.StopFilter;
- import org.apache.lucene.analysis.standard.StandardFilter;
- import org.apache.lucene.analysis.standard.StandardTokenizer;
- import com.yida.framework.lucene5.util.analyzer.codec.MetaphoneReplacementFilter;
- /**
-  * Custom analyzer that adds synonyms to the token stream.
-  * <p>
-  * Chain: StandardTokenizer, then LowerCaseFilter, then StopFilter (English stop words),
-  * then SynonymFilter.
-  *
-  * @author Lanxiaowei
-  * @createTime 2015-03-31 10:15:23
-  */
- public class SynonymAnalyzer extends Analyzer {
-     private final SynonymEngine engine;
-     /**
-      * @param engine the synonym source handed to every {@link SynonymFilter} this analyzer builds
-      */
-     public SynonymAnalyzer(SynonymEngine engine) {
-         this.engine = engine;
-     }
-     /**
-      * Builds the analysis chain for one field.
-      *
-      * @param fieldName the name of the field being analyzed (unused; every field gets the same chain)
-      * @return the tokenizer together with the fully decorated token stream
-      */
-     @Override
-     protected TokenStreamComponents createComponents(String fieldName) {
-         Tokenizer tokenizer = new StandardTokenizer();
-         // Lower-case first: the synonym lookup is an exact string match, so a token such as
-         // "Quick" would find no synonyms if the SynonymFilter saw it before lower-casing.
-         TokenStream tokenStream = new LowerCaseFilter(tokenizer);
-         // Drop stop words before the lookup so no synonyms are fetched for discarded tokens.
-         tokenStream = new StopFilter(tokenStream, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
-         tokenStream = new SynonymFilter(tokenStream, engine);
-         return new TokenStreamComponents(tokenizer, tokenStream);
-     }
- }
- package com.yida.framework.lucene5.analyzer.synonym;
- import java.io.IOException;
- import org.apache.lucene.analysis.Analyzer;
- import com.yida.framework.lucene5.util.AnalyzerUtils;
- /**
-  * Small command-line demo: analyzes a sample sentence with {@link SynonymAnalyzer}
-  * and prints the resulting tokens.
-  */
- public class SynonymAnalyzerTest {
-     /**
-      * @param args ignored
-      * @throws IOException if analysis fails
-      */
-     public static void main(String[] args) throws IOException {
-         String text = "The quick brown fox jumps over the lazy dog";
-         Analyzer analyzer = new SynonymAnalyzer(new BaseSynonymEngine());
-         try {
-             AnalyzerUtils.displayTokens(analyzer, text);
-         } finally {
-             // Analyzer is Closeable; release it even when analysis throws.
-             analyzer.close();
-         }
-     }
- }
- package com.yida.framework.lucene5.util;
- import java.io.IOException;
- import junit.framework.Assert;
- import org.apache.lucene.analysis.Analyzer;
- import org.apache.lucene.analysis.TokenStream;
- import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
- import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
- import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
- import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
- /**
-  * Simple helper for testing analyzers. It prints how a text is tokenized: each term's
-  * start and end offset, its position, the term text and the term type.
-  *
-  * @author Lanxiaowei
-  */
- public class AnalyzerUtils {
-     /**
-      * Analyzes {@code text} with {@code analyzer} and prints every token to standard output.
-      *
-      * @param analyzer the analyzer to run
-      * @param text     the text to analyze
-      * @throws IOException if analysis fails
-      */
-     public static void displayTokens(Analyzer analyzer, String text) throws IOException {
-         TokenStream tokenStream = analyzer.tokenStream("text", text);
-         displayTokens(tokenStream);
-     }
-     /**
-      * Consumes the stream and prints one line per token in the form
-      * {@code position:[term]:(start-->end):type}. The {@code position:} prefix is printed
-      * only for tokens that advance the position, so stacked tokens (increment 0) appear
-      * without it.
-      * <p>
-      * The stream is fully consumed and closed by this method, following the TokenStream
-      * workflow reset, incrementToken, end, close.
-      *
-      * @param tokenStream a stream that has not been reset yet
-      * @throws IOException if reading from the stream fails
-      */
-     public static void displayTokens(TokenStream tokenStream) throws IOException {
-         OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
-         PositionIncrementAttribute positionIncrementAttribute = tokenStream.addAttribute(PositionIncrementAttribute.class);
-         CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
-         TypeAttribute typeAttribute = tokenStream.addAttribute(TypeAttribute.class);
-         try {
-             tokenStream.reset();
-             int position = 0;
-             while (tokenStream.incrementToken()) {
-                 int increment = positionIncrementAttribute.getPositionIncrement();
-                 if (increment > 0) {
-                     position = position + increment;
-                     System.out.print(position + ":");
-                 }
-                 int startOffset = offsetAttribute.startOffset();
-                 int endOffset = offsetAttribute.endOffset();
-                 String term = charTermAttribute.toString();
-                 System.out.println("[" + term + "]" + ":(" + startOffset + "-->" + endOffset + "):" + typeAttribute.type());
-             }
-             tokenStream.end();
-         } finally {
-             // Without close() the analyzer refuses to hand out its reused stream again.
-             tokenStream.close();
-         }
-     }
-     /**
-      * Asserts that analyzing {@code text} yields exactly the expected terms, in order.
-      *
-      * @param analyzer  the analyzer under test
-      * @param text      the source string
-      * @param expecteds the terms expected after analysis
-      * @throws IOException if analysis fails
-      */
-     public static void assertAnalyzerTo(Analyzer analyzer, String text, String[] expecteds) throws IOException {
-         TokenStream tokenStream = analyzer.tokenStream("text", text);
-         try {
-             CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
-             // reset() is mandatory before the first incrementToken() call.
-             tokenStream.reset();
-             for (String expected : expecteds) {
-                 Assert.assertTrue(tokenStream.incrementToken());
-                 Assert.assertEquals(expected, charTermAttribute.toString());
-             }
-             Assert.assertFalse(tokenStream.incrementToken());
-             tokenStream.end();
-         } finally {
-             // Close even when an assertion fails so the analyzer can be used again.
-             tokenStream.close();
-         }
-     }
- }
以上代码都是Lucene in Action这本书里面的示例代码,我只不过是基于Lucene5.x把它重写并调试成功了,特此分享,希望对正在学习Lucene5的童鞋们有所帮助。demo代码我会在底下附件里上传,有需要demo源码的请自己在底下的附件里下载,Lucene in Action这本书的随书源码我已上传到我的百度网盘,也一并分享给大家,Lucene in Action随书源码百度网盘下载地址:
