// HtmlParser应用范例 (HtmlParser usage example)
/**
 * Package: wenc.htmlparser<br>
 * File: Test.java<br>
 * Author: wenc<br>
 * Time: 2007-8-2 04:03:22 PM<br>
 *
 * @version 1.0
 */
package wenc.htmlparser;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;
import org.htmlparser.visitors.TextExtractingVisitor;
/**
*作用:
*
*@authorwenc
*
*/
publicclass TestHtmlParser {
publicstaticvoid main(String[] args) throws Exception {
/*
* String aFile = "e:/test.htm";
*
* String content = readTextFile(aFile, "GBK");
*/
String content = "<h3>随笔分类<span style=’font-size:11px;font-weight:normal’>"
+ "(78)</span></h3>"
+ "<img src=\"http://www.feedsky.com/images/icon_subscribe_google_fs.gif\"></a>";
test1(content);
System.out.println("1====================================");
test2(content);
System.out.println("2====================================");
test3(content);
System.out.println("3====================================");
test4(content);
System.out.println("4====================================");
// test5(aFile);
System.out.println("5====================================");
}
/**
*读取文件的方式来分析内容.filePath也可以是一个Url.
*
*@paramresource
* 文件/Url
*/
publicstaticvoid test5(String resource) throws Exception {
Parser myParser = new Parser(resource);
// 设置编码
myParser.setEncoding("GBK");
HtmlPage visitor = new HtmlPage(myParser);
myParser.visitAllNodesWith(visitor);
String textInPage = visitor.getTitle();
System.out.println(textInPage);
}
/**
*按页面方式处理.对一个标准的Html页面,推荐使用此种方式.
*/
publicstaticvoid test4(String content) throws Exception {
Parser myParser;
myParser = Parser.createParser(content, "GBK");
HtmlPage visitor = new HtmlPage(myParser);
myParser.visitAllNodesWith(visitor);
String textInPage = visitor.getTitle();
System.out.println(textInPage);
}
/**
*利用Visitor模式解析html页面.
*
*小优点:翻译了<>等符号缺点:好多空格,无法提取link
*
*/
publicstaticvoid test3(String content) throws Exception {
Parser myParser;
myParser = Parser.createParser(content, "GBK");
TextExtractingVisitor visitor = new TextExtractingVisitor();
myParser.visitAllNodesWith(visitor);
String textInPage = visitor.getExtractedText();
System.out.println(textInPage);
}
/**
*得到普通文本和链接的内容.
*
*使用了过滤条件.
*/
publicstaticvoid test2(String content) throws ParserException {
Parser myParser;
NodeList nodeList = null;
myParser = Parser.createParser(content, "GBK");
NodeFilter textFilter = new NodeClassFilter(TextNode.class);
NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
// 暂时不处理 meta
// NodeFilter metaFilter = new NodeClassFilter(MetaTag.class);
OrFilter lastFilter = new OrFilter();
lastFilter.setPredicates(new NodeFilter[] { textFilter, linkFilter });
nodeList = myParser.parse(lastFilter);
Node[] nodes = nodeList.toNodeArray();
for (int i = 0; i < nodes.length; i++) {
Node anode = (Node) nodes[i];
String line = "";
if (anode instanceof TextNode) {
TextNode textnode = (TextNode) anode;
// line = textnode.toPlainTextString().trim();
line = textnode.getText();
} elseif (anode instanceof LinkTag) {
LinkTag linknode = (LinkTag) anode;
line = linknode.getLink();
// @todo 过滤jsp标签:可以自己实现这个函数
// line = StringFunc.replace(line, "<%.*%>", "");
}
if (isTrimEmpty(line))
continue;
System.out.println(line);
}
}
/**
*解析普通文本节点.
*
*@paramcontent
*@throwsParserException
*@throwsUnsupportedEncodingException
*/
publicstaticvoid test1(String content) throws ParserException, UnsupportedEncodingException {
StringBuffer text = new StringBuffer();
Parser parser = Parser.createParser(content,"GBK");
// 遍历所有的节点
NodeList nodes = parser.extractAllNodesThatMatch(new NodeFilter() {
publicboolean accept(Node node) {
returntrue;
}
});
Node node = nodes.elementAt(0);
text.append(node.toPlainTextString());
System.out.println(text.toString());
}
/**
*读取一个文件到字符串里.
*
*@paramsFileName
* 文件名
*@paramsEncode
* String
*@return文件内容
*/
publicstatic String readTextFile(String sFileName, String sEncode) {
StringBuffer sbStr = new StringBuffer();
try {
File ff = new File(sFileName);
InputStreamReader read = new InputStreamReader(new FileInputStream(
ff), sEncode);
BufferedReader ins = new BufferedReader(read);
String dataLine = "";
while (null != (dataLine = ins.readLine())) {
sbStr.append(dataLine);
sbStr.append("\r\n");
}
ins.close();
} catch (Exception e) {
System.out.println("read Text File Error" + e.getMessage());
}
return sbStr.toString();
}
/**
*去掉左右空格后字符串是否为空
*
*@paramastr
* String
*@returnboolean
*/
publicstaticboolean isTrimEmpty(String astr) {
if ((null == astr) || (astr.length() == 0)) {
returntrue;
}
if (isBlank(astr.trim())) {
returntrue;
}
returnfalse;
}
/**
*字符串是否为空:null或者长度为0.
*
*@paramastr
* 源字符串.
*@returnboolean
*/
publicstaticboolean isBlank(String astr) {
if ((null == astr) || (astr.length() == 0)) {
returntrue;
} else {
returnfalse;
}
}
}
// 分类: java | 2,916 次阅读
// 原文链接: http://www.wenhq.com/article/view_102.html
// 欢迎转载,请注明出处:亲亲宝宝