首页 > java > lucene3相关文章搜索的实现

lucene3相关文章搜索的实现

2010年4月27日 发表评论 阅读评论
TermVector 是 Lucene 1.4 新增的特性,用于保存分词后每个 Token 的位置增量(Token.getPositionIncrement())以及起止偏移量(Token.startOffset() 和 Token.endOffset())等信息。
各种参数说明:
Field.TermVector.NO:不保存term vectors
Field.TermVector.YES:保存term vectors
Field.TermVector.WITH_POSITIONS:保存term vectors.(保存值和token位置信息)
Field.TermVector.WITH_OFFSETS:保存term vectors.(保存值和Token的offset)
Field.TermVector.WITH_POSITIONS_OFFSETS:保存term vectors.(保存值和token位置信息和Token的offset)
 
Lucene 相关文章搜索的实现代码如下:
public class MoreLike {

       Analyzer analyzer = new IKAnalyzer(); //分词器选择

       Directory ramDir = new RAMDirectory();

 

       public void createRamIndex() throws CorruptIndexException,

                     LockObtainFailedException, IOException {

 

              IndexWriter writer = new IndexWriter(ramDir, analyzer,

                            IndexWriter.MaxFieldLength.LIMITED);
 

              Document doc1 = new Document();

              doc1.add(new Field("title", "wenhq", Field.Store.YES, Field.Index.ANALYZED));

              doc1.add(new Field("author", "callan", Field.Store.YES, Field.Index.ANALYZED));

              doc1.add(new Field("subject",

                            "wenhq.com是亲亲宝宝网站的域名,记录软件开发的经验", Field.Store.YES,

                            Field.Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));

 

              Document doc2 = new Document();

              doc2.add(new Field("title", "english", Field.Store.YES, Field.Index.ANALYZED));

              doc2.add(new Field("author", "wcq", Field.Store.YES, Field.Index.ANALYZED));

              doc2.add(new Field("subject", "学习english的人很多,亲亲宝宝网站的人也在学习", Field.Store.YES, Field.Index.ANALYZED,

                            TermVector.WITH_POSITIONS_OFFSETS));
 

              Document doc3 = new Document();

              doc3.add(new Field("title", "asp", Field.Store.YES, Field.Index.ANALYZED));

              doc3.add(new Field("author", "ca", Field.Store.YES, Field.Index.ANALYZED));

              doc3.add(new Field("subject", "asp是一种网站开发语言", Field.Store.YES, Field.Index.ANALYZED,

                            TermVector.WITH_POSITIONS_OFFSETS));
 
              writer.addDocument(doc1);
              writer.addDocument(doc2);
              writer.addDocument(doc3);
 
              writer.optimize();
              writer.close();
       }
 

       public void search() throws CorruptIndexException, IOException {

              IndexReader reader = IndexReader.open(ramDir);

              IndexSearcher searcher = new IndexSearcher(reader);

              Term term = new Term("title", "wenhq"); // 在title里查询wenhq词条

              TermQuery query = new TermQuery(term);

              TopScoreDocCollector collector = TopScoreDocCollector.create(10000,

                            false);

              searcher.search(query, collector);

              ScoreDoc[] hits = collector.topDocs().scoreDocs;

              for (int i = 0; i < hits.length; i++) {

                     Document doc = searcher.doc(hits[i].doc);

                     System.out.println("search: ");

                     System.out.println(doc.get("title") + "###" + doc.get("subject"));

                     morelikeSearch(reader, hits[i].doc);

              }
       }
 

       private void morelikeSearch(IndexReader reader, int id) throws IOException {

              System.out.println("moreLike search: ");

              // 根据这个document的id获取这个field的Term Vector

              // 信息,就是这个field分词之后在这个field里的频率、位置、等信息

              TermFreqVector vector = reader.getTermFreqVector(id, "subject");

 

              BooleanQuery query = new BooleanQuery();

 

              for (int i = 0; i < vector.size(); i++) {

                     TermQuery tq = new TermQuery(new Term("subject",

                                   vector.getTerms()[i])); // 获取每个term保存的Token

 

                     query.add(tq, BooleanClause.Occur.SHOULD);

              }
 

              IndexSearcher searcher = new IndexSearcher(ramDir);

              TopScoreDocCollector collector = TopScoreDocCollector.create(10000,

                            false);

              searcher.search(query, collector);

              ScoreDoc[] hits = collector.topDocs().scoreDocs;

              for (int i = 0; i < hits.length; i++) {

                     Document doc = searcher.doc(hits[i].doc);

                     System.out.println(doc.get("title") + "###" + doc.get("subject"));

              }
       }
      

       public static void main(String[] args) throws CorruptIndexException,

                     IOException {

              MoreLike t = new MoreLike();

              t.createRamIndex();
              t.search();
       }
}
具体的输出结果:
search:需要查询的文章内容
wenhq###wenhq.com是亲亲宝宝网站的域名,记录软件开发的经验
moreLike search: 相关的文章
wenhq###wenhq.com是亲亲宝宝网站的域名,记录软件开发的经验
english###学习english的人很多,亲亲宝宝网站的人也在学习
asp###asp是一种网站开发语言
分类: java 标签: 1,945 次阅读
原文链接:http://www.wenhq.com/article/view_447.html
欢迎转载,请注明出处:亲亲宝宝
  1. 本文目前尚无任何评论.
  1. 本文目前尚无任何 trackbacks 和 pingbacks.