基本信息
源码名称:lucene搜索引擎
源码大小:6.68M
文件格式:.rar
开发语言:Java
更新时间:2016-04-13
   友情提示:(无需注册或充值,赞助后即可获取资源下载链接)

     嘿,亲!知识可是无价之宝呢,但咱这精心整理的资料也耗费了不少心血呀。小小地破费一下,绝对物超所值哦!如有下载和支付问题,请联系我们QQ(微信同号):813200300

本次赞助数额为: 3 元 
   源码介绍:以下为 lucene1 包中的 ReaderFile 工具类源码,提供从 PDF、TXT、DOC、HTML、XLS 文件中提取文本内容的静态方法。


package lucene1;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.util.NodeList;
import org.htmlparser.visitors.TextExtractingVisitor;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;

public class ReaderFile {

public static String pdfFile(String filename)
throws IOException {
FileInputStream instream = new FileInputStream(filename);    // 根据指定文件创建输入流
PDFParser parser = new PDFParser( instream );                // 创建PDF解析器
parser.parse();                                              // 执行PDF解析过程

PDDocument pdfdocument = parser.getPDDocument();             // 获取解析器的PDF文档对象
PDFTextStripper pdfstripper = new PDFTextStripper();         // 生成PDF文档内容剥离器
String contenttxt = pdfstripper.getText(pdfdocument);        // 利用剥离器获取文档
pdfdocument.close();
System.out.println("文件长度 : " contenttxt.length() "\n");
return contenttxt;
}

public static String txtFile(String filename)
throws IOException {
BufferedReader reader = new BufferedReader(new InputStreamReader(
new FileInputStream(filename)));
String line = new String();
String temp = new String();

while ((line = reader.readLine()) != null) {
temp = line;
}
reader.close();
return temp;
}

public static String docFile(String filename)
throws IOException {
FileInputStream in = new FileInputStream(new File(filename));
    
   WordExtractor extractor  = new WordExtractor(in);   // 创建WordExtractor
   String text = extractor.getText();                  // 对DOC文件进行提取
    
   return text;
}

public static String htmlFile(String filename)
throws Exception {
Parser parser = new Parser(filename);                              // 访问目标网站
         parser.setEncoding("utf-8");                             // 设置解析编码格式

         TextExtractingVisitor visitor = new TextExtractingVisitor (); // 生成文本内容抽取对象 
         
         NodeFilter textFilter = new NodeClassFilter(TextNode.class);  // 生成文本过滤器
         
         NodeList nodes = parser.extractAllNodesThatMatch(textFilter); // 利用文本过滤器解析文档
         String text = "";
         for (int i = 0; i < nodes.size(); i )
         {
             TextNode textnode = (TextNode) nodes.elementAt(i);        // 获取文本节点
             String line = textnode.toPlainTextString().trim();        // 转换成纯文本
             if (line.equals("")) continue;
             //System.out.println(line);
             text = line;
         }
         
         parser.visitAllNodesWith (visitor);                           // 访问网页所有节点 
        // System.out.println(text); 
         return text ; // 输出网页正文
}

public static String xlsFile(String filename)
throws IOException {
// 创建对指定Excel工作文件的引用
HSSFWorkbook workbook = new HSSFWorkbook(new FileInputStream(filename));
HSSFSheet sheet = workbook.getSheetAt(0);             // 创建对工作表的引用。
String text = "";
for( int i =0 ; i < workbook.getNumberOfSheets() ; i ) // 循环取表单对象
{
text ="########## sheet:--" i " --########## \n";
sheet = workbook.getSheetAt(i);                      // 查阅文档的Sheet属性
if( sheet != null )
{
for(int m = 0; m < sheet.getLastRowNum(); m )  //  按行循环取行对象
{
HSSFRow row = sheet.getRow(m);
if( row == null){ break;}
text ="\n";
if(row.getLastCellNum() <= 0) break;
text = "-----line:--" m " ---- ,col num:" 
          row.getLastCellNum() "\n";
for(int n = 0; n < row.getLastCellNum(); n ) // 按列循环取单元格对象
{
HSSFCell cell = row.getCell((short)n);

if( cell == null){ break; }
int type = cell.getCellType();
switch(type)
{     case 0:
text =cell.getNumericCellValue() " , \n"; 
break;
     case 1:
     text =cell.getStringCellValue() " , \n"; 
break;
     case 2:
break;
     case 3:
     text = " , \n"; 
break;
 default:
 text ="未知的单元类型" type " , \n";
}
}
}
}
text ="\n";
}
   return text;
}
}