Lucene 3.6.2入门(10) Tika

首先贴出来的是演示了借助Tika创建索引的HelloTikaIndex.java

PS:关于Tika的介绍及用法,详见下方的HelloTika.java

package com.jadyer.lucene;  

import java.io.File;
import java.io.IOException;  

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.apache.tika.Tika;  

import com.chenlb.mmseg4j.analysis.ComplexAnalyzer;  

/**
 * 【Lucene3.6.2入门系列】第10节_Tika
 * @create Aug 19, 2013 11:02:21 PM
 * @author 玄玉<http://blog.csdn.net/jadyer>
 */
public class HelloTikaIndex {
    private Directory directory;
    private IndexReader reader;  

    public HelloTikaIndex(){
        try {
            directory = FSDirectory.open(new File("myExample/myIndex/"));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }  

    /**
     * 创建索引
     */
    public void createIndex(){
        Document doc = null;
        IndexWriter writer = null;
        File myFile = new File("myExample/myFile/");
        try{
            //这里的分词器使用的是MMSeg4j(记得引入mmseg4j-all-1.8.5-with-dic.jar)
            //详见http://blog.csdn.net/jadyer/article/details/10049525中对MMSeg4j的介绍
            writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_36, new 

ComplexAnalyzer()));
            writer.deleteAll();
            for(File file : myFile.listFiles()){
                doc = new Document();
//              //当保存文件的Metadata时,要过滤掉文件夹,否则会报告文件夹无法访问的异常
//              if(file.isDirectory()){
//                  continue;
//              }
//              Metadata metadata = new Metadata();
//              doc.add(new Field("filecontent", new Tika().parse(new FileInputStream(file), 

metadata)));
                doc.add(new Field("filecontent", new Tika().parse(file)));
                doc.add(new Field("filename", file.getName(), Field.Store.YES, 

Field.Index.NOT_ANALYZED));
                writer.addDocument(doc);
            }
        }catch(Exception e) {
            e.printStackTrace();
        }finally{
            if(null != writer){
                try {
                    writer.close();
                } catch (IOException ce) {
                    ce.printStackTrace();
                }
            }
        }
    }  

    /**
     * 获取IndexSearcher实例
     */
    private IndexSearcher getIndexSearcher(){
        try {
            if(reader == null){
                reader = IndexReader.open(directory);
            }else{
                //if the index was changed since the provided reader was opened, open and return 

a new reader; else,return null
                //如果当前reader在打开期间index发生改变,则打开并返回一个新的IndexReader,否则返回

null
                IndexReader ir = IndexReader.openIfChanged(reader);
                if(ir != null){
                    reader.close(); //关闭原reader
                    reader = ir;    //赋予新reader
                }
            }
            return new IndexSearcher(reader);
        }catch(Exception e) {
            e.printStackTrace();
        }
        return null; //发生异常则返回null
    }  

    /**
     * 执行搜索操作
     * @param fieldName 域名(相当于表的字段名)
     * @param keyWords  搜索的关键字
     */
    public void searchFile(String fieldName, String keyWords){
        IndexSearcher searcher = this.getIndexSearcher();
        Query query = new TermQuery(new Term(fieldName, keyWords));
        try {
            TopDocs tds = searcher.search(query, 50);
            for(ScoreDoc sd : tds.scoreDocs){
                Document doc = searcher.doc(sd.doc);
                System.out.print("文档编号=" + sd.doc + "  文档权值=" + doc.getBoost() + "  文档

评分=" + sd.score + "    ");
                System.out.println("filename=" + doc.get("filename"));
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if(null != searcher){
                try {
                    searcher.close(); //记得关闭IndexSearcher
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }  

    /**
     * 测试一下效果
     * @see 测试前记得在myExample/myFile/目录下预先准备几个doc,pdf,html,txt等文件
     */
    public static void main(String[] args) {
        HelloTikaIndex hello = new HelloTikaIndex();
        hello.createIndex();
        hello.searchFile("filecontent", "java");
    }
}

以上是小编为您精心准备的内容,在的博客、问答、公众号、人物、课程等栏目也有的相关内容,欢迎继续使用右上角搜索按钮进行搜索lucene
, apache
, new
, import
, apache tika
, mmseg4j
, tika
, reader
, indexreader
tika pdf
tika 全文检索 lucene、lucene tika、lucene 3.6 jar包下载、lucene 3.6、lucene 3.6.2 下载,以便于您获取更多的相关知识。

时间: 2024-12-10 01:02:16

Lucene 3.6.2入门(10) Tika的相关文章

Lucene 3.6.2入门(11) 高亮

package com.jadyer.lucene; import java.io.File; import java.io.IOException; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; i

Lucene 3.6.2入门(13) Solr3.6.2简介以及整合Tomcat

/** * [Lucene3.6.2入门系列]第13节_Solr3.6.2简介以及整合Tomcat * @see Solr是一个高性能的,采用Java5开发的,基于HTTP和Lucene实现的开源企业级全文搜索引擎 * @see 可以粗暴的理解为:Lucene专注于搜索底层实现及算法优化,Solr专注于高层次的为企业服务的易于使 用和管理的搜索服务器 * @see 再粗暴一点理解为:Lucene和Solr的关系类似于,Ajax和jQuery,Servlet和Struts2,JAX-WS和CXF,

Lucene 3.6.2入门(12) 近实时搜索

package com.jadyer.lucene; import java.io.File; import java.io.IOException; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.ind

Lucene 3.6.2入门(9) 高级搜索之自定义QueryParser

package com.jadyer.lucene; import java.io.File; import java.io.IOException; import java.text.SimpleDateFormat; import java.util.Date; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.

Lucene 3.6.2入门(8) 高级搜索之自定义评分

  package com.jadyer.lucene; import java.io.File; import java.io.IOException; import java.util.Random; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; i

Lucene 3.6.2入门(7) 高级搜索之普通Filter和自定义Filter

package com.jadyer.lucene; import java.io.File; import java.io.IOException; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.Date; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.luce

Lucene 3.6.2入门(6) 高级搜索之排序

package com.jadyer.lucene; import java.io.File; import java.io.IOException; import java.text.SimpleDateFormat; import java.util.Date; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.

Lucene 3.6.2入门(2) 针对索引文件的CRUD

package com.jadyer.lucene; import java.io.File; import java.io.IOException; import java.text.SimpleDateFormat; import java.util.Date; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.

Lucene 3.6.2入门(1) 第一个程序:Hello Word

package com.jadyer.lucene; import java.io.File; import java.io.FileReader; import java.io.IOException; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; i