1. How pinyin search is added to Solr:
The idea is to expand every Chinese character at index time into its full pinyin, its first letter, and pinyin n-grams, and to index all of these alongside the original text, so that documents can be found by typing pinyin. The character-to-pinyin conversion uses pinyin4j:
pinyin4j-2.5.0.jar
Download:
http://sourceforge.net/projects/pinyin4j/
The Solr version used here is 3.6.2.
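To make the principle concrete, here is a minimal standalone sketch of what pinyin4j does, independent of Solr (the class name PinyinDemo is invented for this illustration):

package com.freewebsys.index.analysis;

import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;

// Minimal demo: convert a single Chinese character to lowercase, tone-less pinyin.
public class PinyinDemo {
    public static void main(String[] args) throws BadHanyuPinyinOutputFormatCombination {
        HanyuPinyinOutputFormat format = new HanyuPinyinOutputFormat();
        format.setCaseType(HanyuPinyinCaseType.LOWERCASE);
        format.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
        // A polyphone character has several readings; pinyin4j returns them all.
        String[] pinyins = PinyinHelper.toHanyuPinyinStringArray('中', format);
        for (String p : pinyins) {
            // Prints "zhong" once per reading (tones are stripped, so they collide).
            System.out.println(p);
        }
    }
}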
2. The Tokenizer code:
package com.freewebsys.index.analysis;

import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

public class PinyinTokenizer extends Tokenizer {

    private static final int DEFAULT_BUFFER_SIZE = 512;

    private boolean done = false;
    private int finalOffset;
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    private final HanyuPinyinOutputFormat format = new HanyuPinyinOutputFormat();
    // Separator used to join the generated terms.
    private String padding_char = " ";

    public PinyinTokenizer(Reader reader) {
        this(reader, DEFAULT_BUFFER_SIZE);
    }

    public PinyinTokenizer(Reader input, int bufferSize) {
        super(input);
        termAtt.resizeBuffer(bufferSize);
        format.setCaseType(HanyuPinyinCaseType.LOWERCASE);
        format.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
        format.setVCharType(HanyuPinyinVCharType.WITH_V);
    }

    @Override
    public final boolean incrementToken() throws IOException {
        if (!done) {
            clearAttributes();
            done = true;
            // Read the entire input into the term buffer.
            int upto = 0;
            char[] buffer = termAtt.buffer();
            while (true) {
                final int length = input.read(buffer, upto, buffer.length - upto);
                if (length == -1) {
                    break;
                }
                upto += length;
                if (upto == buffer.length) {
                    buffer = termAtt.resizeBuffer(1 + buffer.length);
                }
            }
            termAtt.setLength(upto);
            String str = termAtt.toString();
            termAtt.setEmpty();

            StringBuilder stringBuilder = new StringBuilder();    // full pinyin, space-separated
            StringBuilder firstLetters = new StringBuilder();     // first-letter abbreviation
            StringBuilder cnLetters = new StringBuilder();        // the original Chinese characters
            StringBuilder allPinYinLetters = new StringBuilder(); // concatenated pinyin for n-gramming

            for (int i = 0; i < str.length(); i++) {
                char c = str.charAt(i);
                if (c < 128) {
                    // ASCII characters pass through unchanged.
                    stringBuilder.append(c);
                } else {
                    try {
                        String[] strs = PinyinHelper.toHanyuPinyinStringArray(c, format);
                        if (strs != null) {
                            // Take the first reading by default.
                            // TODO: handle characters with more than one pinyin.
                            String first_value = strs[0];
                            // Append the original Chinese character.
                            cnLetters.append(c);
                            cnLetters.append(this.padding_char);
                            // Append to the concatenated pinyin (no separator).
                            allPinYinLetters.append(first_value);
                            // Append the full pinyin.
                            stringBuilder.append(first_value);
                            stringBuilder.append(this.padding_char);
                            // Append the first letter only.
                            firstLetters.append(first_value.charAt(0));
                        }
                    } catch (BadHanyuPinyinOutputFormatCombination e) {
                        e.printStackTrace();
                    }
                }
            }

            // Join everything into one whitespace-separated term; a downstream
            // WordDelimiterFilter splits it back into individual tokens.
            termAtt.append(stringBuilder.toString());
            termAtt.append(this.padding_char);
            termAtt.append(cnLetters.toString());
            termAtt.append(this.padding_char);
            termAtt.append(firstLetters.toString());
            termAtt.append(this.padding_char);
            // N-gram the concatenated pinyin so partial pinyin input can match.
            termAtt.append(mergeNGramPinYin(allPinYinLetters.toString()));

            finalOffset = correctOffset(upto);
            offsetAtt.setOffset(correctOffset(0), finalOffset);
            return true;
        }
        return false;
    }

    @Override
    public final void end() {
        // Set the final offset.
        offsetAtt.setOffset(finalOffset, finalOffset);
    }

    @Override
    public void reset(Reader input) throws IOException {
        super.reset(input);
        this.done = false;
    }

    // Run the concatenated pinyin through an NGramTokenizer (defaults: 1- and
    // 2-grams) and join all grams with spaces, so each becomes an index token.
    public static String mergeNGramPinYin(String allPinYin) {
        if (StringUtils.isBlank(allPinYin)) {
            return "";
        }
        StringBuilder result = new StringBuilder();
        NGramTokenizer nGramTokenizer = new NGramTokenizer(new StringReader(allPinYin));
        CharTermAttribute charTermAttribute = nGramTokenizer
                .getAttribute(CharTermAttribute.class);
        try {
            while (nGramTokenizer.incrementToken()) {
                if (result.length() > 0) {
                    result.append(" ");
                }
                // toString() copies only the valid part of the attribute buffer.
                result.append(charTermAttribute.toString());
            }
            nGramTokenizer.end();
            nGramTokenizer.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return result.toString();
    }
}
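To see what the tokenizer actually emits, a throwaway driver like the following can be used (my own sketch, not part of the original setup; the input string is arbitrary):

package com.freewebsys.index.analysis;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import java.io.StringReader;

// Drive the tokenizer by hand and print the single combined token it produces.
public class PinyinTokenizerTest {
    public static void main(String[] args) throws Exception {
        PinyinTokenizer tokenizer = new PinyinTokenizer(new StringReader("中国"));
        CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);
        while (tokenizer.incrementToken()) {
            System.out.println(term.toString());
        }
        tokenizer.end();
        tokenizer.close();
    }
}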
PinyinTokenizerFactory:
package com.freewebsys.index.analysis;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.solr.analysis.BaseTokenizerFactory;

import java.io.Reader;

// Factory so the tokenizer can be referenced from schema.xml.
public class PinyinTokenizerFactory extends BaseTokenizerFactory {
    @Override
    public Tokenizer create(Reader input) {
        return new PinyinTokenizer(input);
    }
}
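Note that Solr only sees these classes if the compiled jar, together with pinyin4j-2.5.0.jar, is on the core's classpath, for example in the instance's lib directory or referenced by a <lib> directive in solrconfig.xml; the exact location depends on your installation.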
3. The Solr schema.xml configuration:
The index and query analyzers are configured separately:
<!-- standard_text.标准分词.创建使用pinyin分词,搜索不使用. --> <fieldType name="standard_text" class="solr.TextField" positionIncrementGap="100"> <analyzer type="index"> <tokenizer class="com.freewebsys.index.analysis.PinyinTokenizerFactory" /> <filter class="solr.LowerCaseFilterFactory" /> <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1" /> </analyzer> <analyzer type="query"> <tokenizer class="solr.StandardTokenizerFactory" /> </analyzer> </fieldType>
4. The tokenization in action:
If the analysis page in the admin console shows tokenization results like the ones sketched below, the pinyin analyzer is configured correctly.
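As a hand-worked illustration (not actual Solr output): for the input 中国, the tokenizer emits one whitespace-joined term containing the full pinyin, the original characters, the first-letter abbreviation, and the 1- and 2-grams of the concatenated pinyin zhongguo, roughly:

zhong guo 中 国 zg z h o n g g u o zh ho on ng gg gu uo

The LowerCaseFilter and WordDelimiterFilter then break this apart into individual indexed tokens, so queries such as zhong, zg, or zh can all match.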
5. Shortcomings:
At the moment, highlighting after a search is somewhat broken, because the index contains a large number of extra pinyin terms.
The approach is also limited to tokenizing fairly short content such as product names, movies, and books; if the text is too long, the NGramTokenizer step generates a pile of useless pinyin grams.
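The growth is easy to quantify: with NGramTokenizer's defaults (minGram=1, maxGram=2), a concatenated pinyin string of n letters yields n unigrams plus n-1 bigrams, i.e. 2n-1 grams. At roughly three to four pinyin letters per character, a 1000-character text expands into several thousand extra index terms, almost all of them noise.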